Fig4 Expansion Load
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
=== Fig4A ===
Fig4A: DAF-plot by loadGroup deleterious
### step1: df
df <- read_tsv("data/015_AAF/001_aaf_bySubspecies/001_aaf_bySubspecies.txt.gz")
## Rows: 3018785 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): Sub, LoadGroup, Group, GenomeType
## dbl (3): Chr, Pos, AAF
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### step2: DAF bin
df1 <- df %>%
rename(DAF=AAF) %>%
filter(DAF!=0) %>%
filter(DAF!=1) %>%
mutate(bin=cut_width(DAF,width = 0.2,boundary = 0)) %>%
# mutate(MergedSub=if_else(Sub=="D","D","AB")) %>%
group_by(Group,GenomeType,LoadGroup,Sub) %>%
count(bin) %>%
mutate(fre=n/sum(n))
### step3: filter dataframe
df2 <- df1 %>%
filter(GenomeType=="AABBDD") %>%
filter(Sub == "D") %>%
filter(LoadGroup=="Deleterious") %>%
droplevels()
### step4: set factor levels
levels_A <- c("LR_EA","OtherHexaploids","Cultivar","LR_EU","Tibetan semi-wild","LR_WA","LR_AM","LR_AF","LR_CSA","Spelt","Yunan wheat","Macha","Club wheat","Vavilovii","Indian dwarf wheat","Xinjiang wheat")
df2$Group <- factor(df2$Group,levels = levels_A)
### 为每个亚群添加倍性信息,dfhash 为 dfcolorDB 服务
dfhash <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/021_WheatVMap2_GermplasmInfo.txt") %>%
select(Taxa=VcfID,GroupbyContinent,Subspecies_by22_TreeValidated,GenomeType) %>%
mutate(GroupforExpansionLoad = if_else(Subspecies_by22_TreeValidated=="Landrace",GroupbyContinent,Subspecies_by22_TreeValidated)) %>%
select(GenomeType,GroupforExpansionLoad) %>%
filter(!is.na(GroupforExpansionLoad)) %>%
distinct(.,GroupforExpansionLoad,.keep_all = T)
## Rows: 1062 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (33): VcfID, Group, Taxa, Accessions_byLuLab, GenomeType, Genus, Species...
## dbl (5): Longitude, Latitude, Altitude, ReleasedIntroducedYear, IfOnVmap2_S643
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### 每个亚群的颜色信息
dfcolorDB <- read_tsv("data/001_TaxaDB/003_VMap2_colorDB.txt") %>%
filter(ColorCategory %in% c("GroupforExpansionLoad")) %>% droplevels() %>%
left_join(.,dfhash,by=c("Group"="GroupforExpansionLoad")) %>%
### 过滤数据,使之和输入的数据框保持一致
filter(GenomeType =="AABBDD") %>%
### 这里是个重点,我想自定义顺序!
mutate(Label=factor(Label,levels = levels_A)) %>%
arrange(as.factor(Label))
## Rows: 74 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): Group, Label, ColorCode, ColorCategory
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### 设置数据框和画图颜色的因子顺序,使都保持一致
df1$Group <- factor(df1$Group,levels = dfcolorDB$Label)
colB <- dfcolorDB$ColorCode
#### facet by sub
# colB <- c("#e69f00","#004680")
p <- df2 %>%
# filter(Sub %in% c("A","B")) %>%
# filter(GenomeType=="AABBDD") %>%
# filter(LoadGroup=="Deleterious") %>%
# filter(Sub == "D") %>%
# filter(bin %in% c("[0,0.1]","(0.1,0.2]")) %>%
ggplot(aes(x=bin,y=fre,group=Group,color=Group))+
geom_line(alpha=0.9)+
labs(y="Proportion of deleterious SNPs",x="Derived allele frequency")+
scale_color_manual(values = colB)+
# scale_color_manual(values = colorRampPalette(rev(brewer.pal(n = 12, name = "Set3")))(16))+
# scale_fill_manual(values = colB)+
# scale_y_continuous(limits = c(0,1))+
# scale_x_discrete(label=c("0.1","","0.3","","0.5","","0.7","","0.9",""))+
# facet_grid(Sub~Group)+
theme(strip.background = element_blank())+
# theme(axis.text.x = element_text(vjust = 1, hjust = 1, angle = 45))+
theme(
# plot.margin=unit(rep(1,4),'lines'),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.4),
legend.key = element_blank(),
legend.position = 'none',
text = element_text(size = 6))
p

# ggsave(p,filename = "~/Documents/temp.png",height = 6,width = 8)
library("ggpubr")
ggp_legend <- get_legend(p)
ggsave(as_ggplot(ggp_legend),filename = "~/Documents/legend.pdf",height = 4,width = 1.5)
# ggsave(p,filename = "~/Documents/test.pdf",height = 3.2,width = 3.2) ### 刚好可以把所有图例展示出来
ggsave(p,filename = "~/Documents/test.pdf",height = 2.28,width = 2.28) ### 刚好可以把所有图例展示出来
=== Fig4B ===
ThetaW only hexaploid
### 开始处理 pi 的结果
dfori <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/Group/004_taxaDB_GroupforExpansionLoad.txt") %>%
filter(!is.na(GroupforExpansionLoad)) %>%
### 添加过滤条件
filter(GenomeType=="AABBDD")
## Rows: 1062 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Taxa, GenomeType, GroupforExpansionLoad, GroupforExpansionLoad2, Su...
## dbl (2): Latitude, Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dfhash <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
select(GroupforExpansionLoad,GroupforExpansionLoad2,GenomeType,Subspecies_by6_TreeValidated) %>%
distinct(GroupforExpansionLoad,.keep_all = T)
## Rows: 1062 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Taxa, GenomeType, GroupforExpansionLoad, GroupforExpansionLoad2, Su...
## dbl (2): Latitude, Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- read_tsv("data/014_PopParameters/003_angsd/002_angsd_summary.txt.gz") %>%
filter(tW_mean!=0) %>%
mutate(Sub=str_sub(RefChr,2,2)) %>%
left_join(dfhash,by=c("Group"="GroupforExpansionLoad2")) %>%
### 添加过滤条件
filter(GenomeType=="AABBDD",Sub=="D")
## Rows: 469 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): RefChr, Group
## dbl (10): tW_mean, tP_mean, tF_mean, tH_mean, tL_mean, Tajima_mean, fuf_mean...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### 统计每个亚基因组内,按照A亚基因组排序
### dfV2 为排序后的亚洲列表
dfV1 <- df %>%
mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,levels = c("Landrace","Cultivar","OtherHexaploids"))) %>%
group_by(GenomeType,Sub,Subspecies_by6_TreeValidated,GroupforExpansionLoad) %>%
summarise(mean = mean(tW_mean)) %>%
arrange(.,GenomeType,Sub,Subspecies_by6_TreeValidated,-mean) %>%
ungroup() %>%
filter(Sub=="D") %>%
select(GroupforExpansionLoad)
## `summarise()` has grouped output by 'GenomeType', 'Sub', 'Subspecies_by6_TreeValidated'. You can override using the `.groups` argument.
# add_row(GroupforExpansionLoad="Strangulata")
### 因子排序
dfori$GroupforExpansionLoad <- factor(dfori$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)
df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)
### 计算每个亚群的个数,并添加到taxa数据库后面
dftaxaCount <- dfori %>%
select(GroupforExpansionLoad) %>%
group_by(GroupforExpansionLoad) %>%
count() %>%
ungroup() %>%
mutate(GroupforExpansionLoad_AddCount = str_c(GroupforExpansionLoad," (", n, ")",sep = "")) %>%
select(-n)
yvalue <- 30
p <- df %>%
ggplot(aes(x=GroupforExpansionLoad,y=tW_mean))+
labs(y= expression(italic(theta)[italic(W)]), x = "")+
### 加阴影
geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5,color="white")+
geom_rect(aes(xmin=3-0.5,xmax=3+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=5-0.5,xmax=5+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=7-0.5,xmax=7+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=9-0.5,xmax=9+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=11-0.5,xmax=11+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=13-0.5,xmax=13+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=15-0.5,xmax=15+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
# geom_rect(aes(xmin=17-0.5,xmax=17+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
# geom_rect(aes(xmin=19-0.5,xmax=19+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
# geom_rect(aes(xmin=21-0.5,xmax=21+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
# geom_rect(aes(xmin=23-0.5,xmax=23+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
# geom_rect(aes(xmin=25-0.5,xmax=25+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
# 最上面的条纹
# geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=yvalue,ymax=Inf),fill="#ffd702",alpha=0.5)+
# geom_rect(aes(xmin=2-0.5,xmax=2+0.5,ymin=yvalue,ymax=Inf),fill="#7f5701",alpha=0.5)+
# geom_rect(aes(xmin=3-0.5,xmax=6+0.5,ymin=yvalue,ymax=Inf),fill="#016699",alpha=0.5)+
# geom_rect(aes(xmin=7-0.5,xmax=9+0.5,ymin=yvalue,ymax=Inf),fill="#00f3ff",alpha=0.5)+
geom_rect(aes(xmin=1-0.5,xmax=6+0.5,ymin=yvalue,ymax=Inf),fill="#fc6e6e",alpha=0.5)+
geom_rect(aes(xmin=7-0.5,xmax=7+0.5,ymin=yvalue,ymax=Inf),fill="#9900ff",alpha=0.5)+
geom_rect(aes(xmin=8-0.5,xmax=16+0.5,ymin=yvalue,ymax=Inf),fill="#fe63c2",alpha=0.5)+
# geom_rect(aes(xmin=26-0.5,xmax=26+0.5,ymin=yvalue,ymax=Inf),fill="#87cef9",alpha=0.5)+
# 开始画图
# stat_boxplot(geom = "errorbar",width=0.15,position = position_dodge(1))+ # add error bar
# geom_point(position = position_jitterdodge(0.5),alpha = 0.6,aes(col=Sub),size=0.3)+
# stat_summary(fun.data=mean_sdl,geom="pointrange",size=0.3)+
### boxplot
geom_boxplot(
# aes(color=Sub),
color="black",
position = position_dodge(0.8),
outlier.color = NA,
# outlier.alpha = 0.8,
# outlier.size = 0.7,
alpha=0.8,width=0.7)+ #,width=0.0.7
stat_summary(
aes(group=Sub),
color="red",
fun=mean, geom="point",position = position_dodge(0.8),size=0.4)+
### boxplot and point
# geom_boxplot(position = position_dodge(0.9),outlier.color = 'white',alpha=0.5,outlier.size = 0.5)+
# geom_point(position = position_jitterdodge(0.6),alpha = 0.6,aes(color=GenomeType),size=0.8)+
geom_vline(xintercept = 7.5,linetype="dashed",color="gray")+
# geom_vline(xintercept = 25.5,linetype="dashed",color="black")+
# scale_color_manual(values = c("#fd8582","#967bce","#4bcdc6"),name='')+
scale_x_discrete(labels = dftaxaCount$GroupforExpansionLoad_AddCount)+
theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
# theme(plot.margin=unit(rep(1.3,4),'lines'))+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.5),
legend.position = 'none',
# legend.position = c(0.9,0.6),
text = element_text(size = 7))
p

# ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 4,width = 7.2)
ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 2.12,width = 4.23)
ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 2,width = 3.5)
=== Fig4CDE===
World map
choiceA <- "Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5"
### 先处理del文件
dfdel <- read_tsv("data/004_delCount/delRatio.txt.gz")
## Rows: 371456 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): Taxa, Subgenome, IfSelected, Group_refobj, DelCountGroup, Stand_Loa...
## dbl (1): Stand_delCount
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
select(Taxa=VcfID,GenomeType,Longitude,Latitude)
## Rows: 1062 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (33): VcfID, Group, Taxa, Accessions_byLuLab, GenomeType, Genus, Species...
## dbl (5): Longitude, Latitude, Altitude, ReleasedIntroducedYear, IfOnVmap2_S643
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- dfdel %>% right_join(.,dftaxaDB,by="Taxa") %>%
filter(GenomeType=="AABBDD") %>%
filter(Subgenome=="D") %>%
filter(DelCountGroup == "TotalDerivedSNPCount") %>%
filter(Stand_LoadGroup == choiceA) %>%
select(-c("GenomeType","Subgenome","DelCountGroup","Stand_LoadGroup")) %>% filter(!is.na(Longitude)) ### 802个个体 -> 543 个有经纬度信息
mp<-NULL #定义一个空的地图
# 注意: borders函数中的colour是地图国家边界线的颜色,fill是国家的填充颜色
mapworld<-borders("world",colour = "#dedfe0",fill="#dedfe0") #绘制基本地图
mpvoid<-ggplot()+ mapworld + ylim(-55,90)+theme_void()
colB <- c("#ffd702","#fc6e6e","#87cef9")
mp<-mpvoid +
geom_point(data=df,aes(x=Longitude,y=Latitude
),shape=21,alpha=0.8,size=2,fill="orange")+
theme(legend.position = 'none')
mp

ggsave(mp,filename = "figs/map_s1062_1.pdf", height = 2.76,width = 6.89)
Bubble map
choiceA <- "Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5"
### 先处理del文件
dfdel <- read_tsv("data/004_delCount/delRatio.txt.gz")
## Rows: 371456 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): Taxa, Subgenome, IfSelected, Group_refobj, DelCountGroup, Stand_Loa...
## dbl (1): Stand_delCount
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
select(Taxa=VcfID,GenomeType,Longitude,Latitude)
## Rows: 1062 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (33): VcfID, Group, Taxa, Accessions_byLuLab, GenomeType, Genus, Species...
## dbl (5): Longitude, Latitude, Altitude, ReleasedIntroducedYear, IfOnVmap2_S643
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- dfdel %>% right_join(.,dftaxaDB,by="Taxa") %>%
filter(GenomeType=="AABBDD") %>%
filter(Subgenome=="D") %>%
filter(DelCountGroup == "TotalDerivedSNPCount") %>%
filter(Stand_LoadGroup == choiceA) %>% ###******* choice
select(-c("GenomeType","Subgenome","DelCountGroup","Stand_LoadGroup")) %>% filter(!is.na(Longitude)) %>% ### 802个个体 -> 543 个有经纬度信息
arrange(Stand_delCount) #### 排序
mp<-NULL #定义一个空的地图
# 注意: borders函数中的colour是地图国家边界线的颜色,fill是国家的填充颜色
mapworld<-borders("world",colour = "#dedfe0",fill="#dedfe0") #绘制基本地图
mpvoid<-ggplot()+ mapworld + ylim(-55,90)+theme_void()
library(viridis)
## Loading required package: viridisLite
mp<-mpvoid +
geom_point(data = df,mapping = aes(x=Longitude,y=Latitude,
size=Stand_delCount, color=Stand_delCount),
alpha = 0.9)+
scale_size_continuous(range=c(1,3))+ ## 0.5-3
# scale_color_manual(values = colorRampPalette(brewer.pal(n = 3, name = "RdBu"))(100))+
# scale_fill_manual(values = colorRampPalette(brewer.pal(n = 3, name = "RdBu"))(100))
# scale_fill_gradientn(colours =rev(colorRampPalette(brewer.pal(n = 3, name = "RdBu"))(100)))+
scale_color_gradientn(colours = rev(colorRampPalette(brewer.pal(n = 3, name = "RdBu"))(100)))
# scale_color_viridis(trans="log") +
# scale_fill_viridis(trans="log")
# theme(legend.position = 'none')
mp

ggsave(mp,filename = "figs/map_s1062_2.pdf", height = 2.76,width = 6.89)
Spatstat world map
choiceA <- "Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5"
### 先处理del文件
dfdel <- read_tsv("data/004_delCount/delRatio.txt.gz")
## Rows: 371456 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): Taxa, Subgenome, IfSelected, Group_refobj, DelCountGroup, Stand_Loa...
## dbl (1): Stand_delCount
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
select(Taxa=VcfID,GenomeType,Longitude,Latitude)
## Rows: 1062 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (33): VcfID, Group, Taxa, Accessions_byLuLab, GenomeType, Genus, Species...
## dbl (5): Longitude, Latitude, Altitude, ReleasedIntroducedYear, IfOnVmap2_S643
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
set.seed(123)
df1 <- dfdel %>% right_join(.,dftaxaDB,by="Taxa") %>%
filter(GenomeType=="AABBDD") %>%
filter(Subgenome=="D") %>%
filter(DelCountGroup == "TotalDerivedSNPCount") %>%
filter(Stand_LoadGroup == choiceA) %>%
select(-IfSelected,-Group_refobj) %>%
select(-c("GenomeType","Subgenome","DelCountGroup","Stand_LoadGroup")) %>%
filter(!is.na(Longitude)) %>% ### 802个个体 -> 543 个有经纬度信息
arrange(Stand_delCount)
df <- df1 %>%
mutate(Longitude = Longitude + runif(nrow(df1),0.001,0.005)) ##通过添加随机数,避免重复
plot
# library(raster)
# # detach(package:raster,unload = T)
# library(spatstat)
# library(RColorBrewer)
#
# ppp.df <- ppp(df$Longitude, df$Latitude, c(-180, 180), c(-60, 90))
# marks(ppp.df) <- df[, 2] ### load ratio 做为 marks 信息
# # plot(density(ppp.df)) ###画出点的聚集程度 测试
# ppp.df2 <- Smooth(ppp.df) ### 求每个点对应的load值的大小
# ppp.df2.m <- ppp.df2$v ### 获取矩阵内的密度数据
# ###矩阵上下颠倒128 -> 1 ,依次减一
# ppp.df2.m1 <- ppp.df2.m[seq(128, 1, by = -1), ]
#
# ### 创建 raster 类,并将密度的结果填充进去
# df.raster <- raster(nrows = 128, ncols = 128, xmn = -180, xmx = 180, ymn = -60, ymx = 90)
# values(df.raster) <- as.vector(t(as.matrix(ppp.df2.m1)))
# plot(df.raster)
#
# world.shp <- shapefile("data/002_spatstat/spatstat/shapefile/TM_WORLD_BORDERS_SIMPL-0.3/TM_WORLD_BORDERS_SIMPL-0.3.shp")
# df.raster.final <- mask(df.raster, world.shp)
# par(mfrow = c(1,1))
#
# # pdf(file = str_c("figs/Fig4/misc",choiceA,"_wholeWorld.pdf"),height = 3,width = 4)
# ### whole world
# png(filename = str_c("~/Documents/",choiceA,"_wholemap.png"),units = 'in',height = 4,width = 5.3,res = 300)
# plot(df.raster.final, col = colorRampPalette(rev(brewer.pal(n = 3, name = "RdBu")))(100),xaxt="n",yaxt="n")
# # points(df$Longitude, df$Latitude, pch = 16, cex = 0.5,col=rgb(252, 110, 110,90,maxColorValue=255))
# axis(1,at=seq(-150,180,60))
# axis(2,at=seq(-90,90,60))
# dev.off()
#
# ### 亚欧大陆和非洲
# png(filename =str_c("~/Documents/",choiceA,"_Eurasia.png"),units = 'in',height = 4,width = 5.3,res = 300)
# # pdf( "~/Documents/test.pdf")
# plot(df.raster.final, col = colorRampPalette(rev(brewer.pal(n = 3, name = "RdBu")))(100),xlim=c(-20,150),ylim=c(-35,62),xaxt="n",yaxt="n")
# # points(df$Longitude, df$Latitude, pch = 16, cex = 0.8,col=rgb(252, 110, 110,90,maxColorValue=255))
# axis(1,at=seq(0,150,30))
# axis(2,at=seq(-20,62,20))
# # axis(1,at=seq(0,150,30),labels = c("0","","","","",""))
# # axis(2,at=seq(0,62,20))
# dev.off()
#
# ### 美洲和非洲
# png(filename = str_c("~/Documents/",choiceA,"_American.png"),units = 'in',height = 4,width = 5.3,res = 300)
# plot(df.raster.final, col = colorRampPalette(rev(brewer.pal(n = 3, name = "RdBu")))(100),xlim=c(-150,60),ylim=c(-60,62),xaxt="n",yaxt="n")
# # points(df$Longitude, df$Latitude, pch = 16, cex = 0.8,col=rgb(252, 110, 110,90,maxColorValue=255))
# axis(1,at=seq(-150,60,30))
# axis(2,at=seq(-60,62,30))
# dev.off()
=== Fig4F===
boxplot load
- 可以选择定义有害突变的方式 unique(dfdel$Stand_LoadGroup)
choiceA <- "Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5"
dfdel <- read_tsv("data/004_delCount/delRatio.txt.gz")
## Rows: 371456 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): Taxa, Subgenome, IfSelected, Group_refobj, DelCountGroup, Stand_Loa...
## dbl (1): Stand_delCount
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### ********************** Section1 *********************************###
### taxa 属性数据框
factorA <- c("Landrace","Cultivar","OtherHexaploids","compactum","sphaerococcum","tibeticum","vavilovii","petropavlovskyi","yunna-nense","macha","spelta","dicoccoides","dicoccum","durum","turanicum","polonicum","turgidum","karamyschevii","ispahanicum","carthlicum","strangulata")
labelsA <- c("Landrace","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Vavilovii","Xinjiang wheat","Yunan wheat","Macha","Spelt","Wild emmer","Domesticated emmer","Durum","Khorasan wheat ","Polish wheat","Rivet wheat","Georgian wheat","Ispahanicum","Persian wheat","Strangulata")
dfhash <- tibble(Subspecies_by22_TreeValidated=factorA,Subspecies22=labelsA)
dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
select(Taxa=VcfID,GenomeType,Subspecies_by22_TreeValidated,GroupbyContinent) %>%
left_join(.,dfhash,by="Subspecies_by22_TreeValidated") %>%
mutate(GroupforExpansionLoad = if_else(Subspecies22=="Landrace",GroupbyContinent,Subspecies22)) %>%
select(Taxa,GenomeType,GroupforExpansionLoad)
## Rows: 1062 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (33): VcfID, Group, Taxa, Accessions_byLuLab, GenomeType, Genus, Species...
## dbl (5): Longitude, Latitude, Altitude, ReleasedIntroducedYear, IfOnVmap2_S643
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### ********************** Section2 *********************************###
### del load 数据框
df <- dfdel %>% left_join(.,dftaxaDB,by="Taxa") %>%
filter(Subgenome=="B") %>%
filter(DelCountGroup=="TotalDerivedSNPCount") %>%
filter(GenomeType=="AABBDD") %>%
select(-c(DelCountGroup,IfSelected,Group_refobj,GenomeType)) %>%
filter(!is.na(GroupforExpansionLoad))
### 统计每个亚基因组内,以亚洲为单位的均值,按照D亚基因组排序
### dfV2 为排序后的亚洲列表
dfV1 <- df %>%
group_by(Subgenome,Stand_LoadGroup,GroupforExpansionLoad) %>%
summarise(mean = mean(Stand_delCount)) %>%
arrange(.,Subgenome,Stand_LoadGroup,mean) %>%
ungroup() %>%
filter(Subgenome=="B",Stand_LoadGroup== choiceA) %>%
select(GroupforExpansionLoad)
## `summarise()` has grouped output by 'Subgenome', 'Stand_LoadGroup'. You can override using the `.groups` argument.
### 因子排序,排序后计数
df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)
### 计算每个亚群的个数
### 千万注意数据框的分组情况,如sub VariantsGroup ,计算后核查确认!!!
dftaxaCount <- df %>%
group_by(Subgenome,Stand_LoadGroup,GroupforExpansionLoad) %>%
count() %>% ungroup() %>%
distinct(.,GroupforExpansionLoad,.keep_all = T) %>%
select(-Subgenome) %>%
mutate(GroupforExpansionLoad_AddCount = str_c(GroupforExpansionLoad," (", n, ")",sep = "")) %>%
select(-n)
plot
library(RColorBrewer)
p <- df %>%
filter(Stand_LoadGroup==choiceA) %>%
ggplot(aes(x=GroupforExpansionLoad,y= Stand_delCount))+
labs(y=str_c("Mutation load per accession\nby ",choiceA),x="")+
geom_boxplot(aes(fill=GroupforExpansionLoad), outlier.color =NA,alpha=0.6)+ ### width=0.1
geom_point(position = position_jitterdodge(0.6),alpha = 0.4,aes(color=Subgenome),size=0.2)+
stat_summary(aes(group = Subgenome), fun=mean, geom="point", color="blue",position = position_dodge(1),size=0.7)+
scale_color_manual(values =c("#fd8582","#967bce","#4bcdc6"),name='')+
scale_fill_manual(values = colorRampPalette(rev(brewer.pal(n = 1, name = "RdBu")))(16))+
scale_x_discrete(labels = dftaxaCount$GroupforExpansionLoad_AddCount)+
theme(strip.background = element_blank())+
# theme(plot.margin=unit(rep(1.3,4),'lines'))+
coord_flip()+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.4),
text = element_text(size = 7),legend.position = 'none')
## Warning in brewer.pal(n = 1, name = "RdBu"): minimal value for n is 3, returning requested palette with 3 different levels

### 竖版
# size=7
ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 4.2,width = 2.6)
### 横版
# size=14
# ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 4.2,width = 2.6)
=== Fig4G===
TajimaD
df <- read_tsv("data/014_PopParameters/003_angsd/001_angsd_subspecies26_geneRegion_RefChr.txt.gz") %>%
mutate(Sub=str_sub(RefChr,2))
### taxa 属性数据框
factorA <- c("OtherHexaploids","Cultivar","OtherHexaploids","compactum","sphaerococcum","tibeticum","vavilovii","petropavlovskyi","yunna-nense","macha","spelta","dicoccoides","dicoccum","durum","turanicum","polonicum","turgidum","karamyschevii","ispahanicum","carthlicum","strangulata", "LR_WA","LR_EA","LR_EU","LR_CSA","LR_AM","LR_AF")
labelsA <- c("OtherHexaploids","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Vavilovii","Xinjiang wheat","Yunan wheat","Macha","Spelt","Wild emmer","Domesticated emmer","Durum","Khorasan wheat ","Polish wheat","Rivet wheat","Georgian wheat","Ispahanicum","Persian wheat","Strangulata","LR_WA","LR_EA","LR_EU","LR_CSA","LR_AM","LR_AF")
dfhash <- tibble(Group=factorA,Subspecies22=labelsA)
### 得到3列数据,GenomeType,GroupforExpansionLoad,Group
dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
select(Taxa=VcfID,GenomeType,Subspecies_by22_TreeValidated,GroupbyContinent) %>%
mutate(Group=if_else(Subspecies_by22_TreeValidated=="Landrace",GroupbyContinent,Subspecies_by22_TreeValidated)) %>%
left_join(.,dfhash,by="Group") %>%
mutate(GroupforExpansionLoad = if_else(Subspecies22=="Landrace",GroupbyContinent,Subspecies22)) %>%
select(GenomeType,GroupforExpansionLoad,Group) %>%
filter(!is.na(GroupforExpansionLoad)) %>%
distinct(GroupforExpansionLoad,.keep_all = T)
### ********************** Section2 *********************************###
###
df <- df %>% left_join(.,dftaxaDB,by="Group")
plot density_ridges TajimaD
fun_getLoadOrder <- function(){
### ********************** Section1 *********************************###
### 求作图的顺序和load一致
choiceA <- "Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5"
dfdel <- read_tsv("data/004_delCount/delRatio.txt.gz")
### taxa 属性数据框
factorA <- c("Landrace","Cultivar","OtherHexaploids","compactum","sphaerococcum","tibeticum","vavilovii","petropavlovskyi","yunna-nense","macha","spelta","dicoccoides","dicoccum","durum","turanicum","polonicum","turgidum","karamyschevii","ispahanicum","carthlicum","strangulata")
labelsA <- c("Landrace","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Vavilovii","Xinjiang wheat","Yunan wheat","Macha","Spelt","Wild emmer","Domesticated emmer","Durum","Khorasan wheat ","Polish wheat","Rivet wheat","Georgian wheat","Ispahanicum","Persian wheat","Strangulata")
dfhash <- tibble(Subspecies_by22_TreeValidated=factorA,Subspecies22=labelsA)
dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
select(Taxa=VcfID,GenomeType,Subspecies_by22_TreeValidated,GroupbyContinent) %>%
left_join(.,dfhash,by="Subspecies_by22_TreeValidated") %>%
mutate(GroupforExpansionLoad = if_else(Subspecies22=="Landrace",GroupbyContinent,Subspecies22)) %>%
select(Taxa,GenomeType,GroupforExpansionLoad)
### del load 数据框
df <- dfdel %>% left_join(.,dftaxaDB,by="Taxa") %>%
filter(Subgenome=="D") %>%
filter(DelCountGroup=="TotalDerivedSNPCount") %>%
filter(GenomeType=="AABBDD") %>%
select(-c(DelCountGroup,IfSelected,Group_refobj,GenomeType)) %>%
filter(!is.na(GroupforExpansionLoad))
### 统计每个亚基因组内,以亚洲为单位的均值,按照D亚基因组排序
### dfV2 为排序后的亚洲列表
dfV1 <- df %>%
group_by(Subgenome,Stand_LoadGroup,GroupforExpansionLoad) %>%
summarise(mean = mean(Stand_delCount)) %>%
arrange(.,Subgenome,Stand_LoadGroup,mean) %>%
ungroup() %>%
filter(Subgenome=="D",Stand_LoadGroup== choiceA) %>%
select(GroupforExpansionLoad)
order <- dfV1$GroupforExpansionLoad
### 因子排序,排序后计数
df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = order)
return(order)
}
labelA <- fun_getLoadOrder()
## Rows: 371456 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): Taxa, Subgenome, IfSelected, Group_refobj, DelCountGroup, Stand_Loa...
## dbl (1): Stand_delCount
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1062 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (33): VcfID, Group, Taxa, Accessions_byLuLab, GenomeType, Genus, Species...
## dbl (5): Longitude, Latitude, Altitude, ReleasedIntroducedYear, IfOnVmap2_S643
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'Subgenome', 'Stand_LoadGroup'. You can override using the `.groups` argument.
df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = labelA)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(ggridges)
### ********************** Section2 *********************************###
df1 <- df %>%
filter(Sub=="D") %>%
filter(GenomeType=="AABBDD") %>%
select(Chr,WinCenter,tW,tP,Tajima,nSites,GroupforExpansionLoad,RefChr,RefPos,Sub) %>%
# filter(tW>0) %>%
filter(!is.na(Tajima)) %>%
mutate(num=as.numeric(factor(GroupforExpansionLoad)))
p <-
# ggplot(df1,aes(x=Tajima,y=GroupforExpansionLoad,fill = 0.5 - abs(0.5 - stat(ecdf))))+
ggplot(df1,aes(x=Tajima,y=GroupforExpansionLoad,fill = stat(ecdf)))+
# stat_density_ridges(quantile_lines = TRUE, quantiles = 2
# ,scale=4,alpha=0.6, color='gray'
# )+
stat_density_ridges(geom = "density_ridges_gradient", calc_ecdf = TRUE,scale=4,alpha=0.6, color='gray') +
# geom_density_ridges_gradient(scale = 3,size=0.3) +
scale_fill_gradientn(colours = rev(colorRampPalette(brewer.pal(n = 3, name = "RdBu"))(100)))+
# scale_fill_viridis_c(name = "Temp. [F]", option = "C") +
# scale_y_discrete(expand = c(0.01, 0)) +
# scale_x_continuous(expand = c(0, 0),limits = c(-2,4),breaks = seq(-2,4,2))+
coord_cartesian(xlim = c(-1.5,4))+
# theme_ridges(grid = FALSE,center_axis_labels = TRUE)+
# theme_pubr()+
# labs(x= expression(paste("Tajima's ",italic(D))), y = "Group number")+
labs(x= expression(paste("Tajima's ",italic(D))), y = "")+
geom_vline(xintercept = 0, linetype="dotted", color = "blue")+
theme(strip.background = element_blank())+
theme(panel.grid= element_blank(),
panel.background = element_blank(),
axis.line = element_line(size=0.4, colour = "black"),
text = element_text(size = 14.5),legend.position = 'none')
p
## Picking joint bandwidth of 0.115

### AI size=7
ggsave(p,filename = "~/Documents/test.pdf",height = 3.9,width = 1.96)
## Picking joint bandwidth of 0.115
### PPT size=14.5
ggsave(p,filename = "~/Documents/test.pdf",height = 6,width = 6)
## Picking joint bandwidth of 0.115
=======================
==== Extended Data ====
=======================
======
****Map by subspecies
### ********************** Section1 *********************************###
### taxa 属性数据框
factorA <- c("dicoccoides","dicoccum","durum","turanicum","polonicum","turgidum","karamyschevii","ispahanicum","carthlicum","Landrace","Cultivar","OtherHexaploids","compactum","sphaerococcum","tibeticum","vavilovii","petropavlovskyi","yunna-nense","macha","spelta","strangulata")
labelsA <- c("Wild emmer","Domesticated emmer","Durum","Khorasan wheat","Polish wheat","Rivet wheat","Georgian wheat","Ispahanicum","Persian wheat","Landrace","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Vavilovii","Xinjiang wheat","Yunan wheat","Macha","Spelt","Strangulata")
dfhash <- tibble(Subspecies_by22_TreeValidated=factorA,Subspecies22=labelsA)
dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
select(Taxa=VcfID,GenomeType,Subspecies_by22_TreeValidated,GroupbyContinent,Latitude,Longitude,orgCty,Country) %>%
left_join(.,dfhash,by="Subspecies_by22_TreeValidated") %>%
mutate(GroupforExpansionLoad = if_else(Subspecies22=="Landrace",GroupbyContinent,Subspecies22)) %>%
mutate(GroupforExpansionLoad2=
if_else(Subspecies22=="Landrace",GroupbyContinent,Subspecies_by22_TreeValidated)) %>%
select(Taxa,GenomeType,GroupforExpansionLoad,GroupforExpansionLoad2,Latitude,Longitude,orgCty,Country)
## Rows: 1062 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (33): VcfID, Group, Taxa, Accessions_byLuLab, GenomeType, Genus, Species...
## dbl (5): Longitude, Latitude, Altitude, ReleasedIntroducedYear, IfOnVmap2_S643
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# write_tsv(dftaxaDB,file = "/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/Group/002_taxaDB_GroupforExpansionLoad.txt")
### ********************** Section 2 *********************************###
### 先将有经纬度信息的材料进行画图查看,将异常点的位置信息进行校正
dftaxaDB2 <- dftaxaDB %>%
filter(!is.na(Latitude))
### 写出没有经纬度信息的336份材料,根据国家添加人为估计的经纬度信息
dftaxaDB3_noPos <- dftaxaDB %>%
filter(is.na(Latitude))
# write_csv(dftaxaDB3_noPos,file = "/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/026_heter_load_distance/001_noPosition_table/001_taxa_noPosition.csv")
### 写出具体的亚群,手动添加地区注释
dftable <- dftaxaDB %>%
filter(!is.na(GroupforExpansionLoad)) %>%
distinct(GroupforExpansionLoad,.keep_all = T) %>%
select(GenomeType,GroupforExpansionLoad,GroupforExpansionLoad2)
# write_csv(dftable,file = "~/Documents/taxaDB_category.csv")
plotfun_map <- function(fun_df,fun_factor){
mp <- NULL #定义一个空的地图
# 注意: borders函数中的colour是地图国家边界线的颜色,fill是国家的填充颜色
mapworld <- borders("world", colour = "#dedfe0", fill = "#dedfe0") #绘制基本地图
mpvoid <- ggplot() + mapworld + ylim(-55, 90) + theme_void()
mp <- mpvoid +
geom_point(
data = fun_df,
aes(x = Longitude, y = Latitude),
shape = 21,
alpha = 0.8,
size = 1.5,
fill = "orange"
) +
labs(title = fun_factor)+
theme(plot.title = element_text(hjust = 0.5)) +
theme(text = element_text(size = 6))+
theme(legend.position = 'none')
mp
ggsave(mp,filename = str_c("~/Documents/",fun_factor,"_map.pdf"),height = 1.38,width = 3.445)
return(mp)
}
#### 需要修饰的方面
# 1.调节因子顺序
# 2.标题居中,调节字体大小
# 3.调节圆点大小
### 建立2个一模一样的表格,为创建矩阵做准备
df2 <- dftaxaDB2 %>%
filter(!is.na(GroupforExpansionLoad)) %>%
# mutate(GroupforExpansionLoad=factor(GroupforExpansionLoad,levels = factorAo,labels = factorAo)) %>%
# arrange(factor(GroupforExpansionLoad,levels = factorAo)) %>%
group_by(as.character(GroupforExpansionLoad)) %>%
group_map(~plotfun_map(.x,.y))
### 图片保存
q <- ggpubr::ggarrange(plotlist=df2,ncol = 4,nrow = 6)
q

cowplot::save_plot(q,filename = "~/Documents/test.pdf",base_height = 7.2,base_width = 7.2)
# factorAo <- c("Wild emmer","Domesticated emmer","Durum","Khorasan wheat","Polish wheat","Rivet wheat","Georgian wheat","Ispahanicum","Persian wheat","LR_EA","LR_CSA","LR_WA","LR_AF","LR_EU","LR_AM","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Vavilovii","Xinjiang wheat","Yunan wheat","Macha","Spelt","Strangulata")
# factorAo <- c("Wild emmer","Domesticated emmer","Durum","Khorasan wheat ","Polish wheat","Rivet wheat","Ispahanicum","Persian wheat","LR_EA","LR_CSA","LR_WA","LR_AF","LR_EU","LR_AM","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Xinjiang wheat","Macha","Spelt","Strangulata")
### "Georgian wheat","Vavilovii","Yunan wheat"
# q <- ggpubr::ggarrange(plotlist=list(df2[[22]],df2[[3]],df2[[4]],df2[[7]],df2[[17]],
# df2[[18]],df2[[]],df2[[]],df2[[]],df2[[]],
# df2[[]],df2[[]],df2[[]],df2[[]],df2[[]],
# df2[[]],df2[[]],df2[[]],df2[[]],df2[[]],
# df2[[]],df2[[]],df2[[]])
# ,ncol = 4,nrow = 6)
重要 MAP & Latitude estimation by manual
### ********************** Section1 *********************************###
# df_all <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/Group/002_taxaDB_GroupforExpansionLoad.txt")
#
### step1: 写出没有经纬度信息的336份材料,根据国家添加人为估计的经纬度信息
### "Georgian wheat","Vavilovii","Yunan wheat" 47个国家
#
# df_noPosition <- read_csv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/026_heter_load_distance/001_noPosition_table/001_taxa_noPosition.csv")
#
### step2: 输出没有地理位置信息的国家,手动添加信息
# dftest <- df_noPosition %>%
# distinct(Country,.keep_all = T) %>%
# select(4:7)
### write_csv(dftest,file = "~/Documents/country_noLatitude.csv")
#
### step3: 336份没有经纬度信息的,加上经纬度信息和新的一列(是否是自己手动添加经纬度)
# df_manualPosition <- read_csv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/026_heter_load_distance/001_noPosition_table/002_country_noLatitude.csv")
#
# df_336 <- df_noPosition %>%
# select(Taxa,GenomeType,GroupforExpansionLoad,GroupforExpansionLoad2,Country) %>%
# left_join(df_manualPosition,by="Country") %>%
# mutate(IfLatitudebyManually="Yes") %>%
# relocate(Country,.after = orgCty)
### write_csv(df_336,file = "/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/026_heter_load_distance/001_noPosition_table/003_taxa_noPosition_addbyManually.csv")
#
### step4: 有些没有国家的材料,有种群信息,没有国家信息,但是可以根据种群信息手动添加经纬度信息。
### 这一步进行剩余10几份材料(15)的手动添加
### df_336_withManuallyPosition
#
### step5: 1062-336= 726 份材料添加注释信息
# df_726_withPosition <- df_all %>%
# anti_join(df_336,by="Taxa") %>%
# mutate(IfLatitudebyManually="No") %>%
# mutate(IfHasCountryInfo="Yes")
#
# df_336_withManuallyPosition <- read_csv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/026_heter_load_distance/001_noPosition_table/004_taxa_noPosition_addbyManually_secondTime.csv")
#
# df_1062_withPosition <- df_726_withPosition %>%
# bind_rows(df_336_withManuallyPosition) %>%
# arrange(Taxa)
#
### write_tsv(df_1062_withPosition,file = "/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/Group/003_taxaDB_GroupforExpansionLoad.txt")
### write_csv(df_1062_withPosition,file = "/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/Group/003_taxaDB_GroupforExpansionLoad.csv")
### step6: 添加 6个群的信息
# dftaxaDB <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/021_WheatVMap2_GermplasmInfo.txt") %>% select(Taxa=VcfID,Subspecies_by6_TreeValidated)
#
# df_1062_withPosition <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/Group/003_taxaDB_GroupforExpansionLoad.txt")
#
# df_1062_update <- df_1062_withPosition %>%
# left_join(dftaxaDB,by="Taxa") %>%
# relocate(Subspecies_by6_TreeValidated,.after = GroupforExpansionLoad2)
###write_tsv(df_1062_update,file = "/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/Group/004_taxaDB_GroupforExpansionLoad.txt")
### ------------ 开始画地图 ------------ ###
### 先建立函数
plotfun_map <- function(fun_df,fun_factor){
mp <- NULL #定义一个空的地图
# 注意: borders函数中的colour是地图国家边界线的颜色,fill是国家的填充颜色
mapworld <- borders("world", colour = "#dedfe0", fill = "#dedfe0") #绘制基本地图
mpvoid <- ggplot() + mapworld + ylim(-55, 90) + theme_void()
mp <- mpvoid +
geom_point(
data = fun_df,
aes(x = Longitude, y = Latitude),
shape = 21,
alpha = 0.8,
size = 1.5,
fill = "orange"
) +
labs(title = fun_factor)+
theme(plot.title = element_text(hjust = 0.5)) +
theme(text = element_text(size = 6))+
theme(legend.position = 'none')
mp
ggsave(mp,filename = str_c("~/Documents/",fun_factor,"_map.pdf"),height = 1.38,width = 3.445)
return(mp)
}
#### 需要修饰的方面
# 1.调节因子顺序
# 2.标题居中,调节字体大小
# 3.调节圆点大小
### 建立2个一模一样的表格,为创建矩阵做准备
### 写出具体的亚群,手动添加地区注释
df2 <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/Group/003_taxaDB_GroupforExpansionLoad.txt") %>%
filter(!is.na(GroupforExpansionLoad)) %>%
# mutate(GroupforExpansionLoad=factor(GroupforExpansionLoad,levels = factorAo,labels = factorAo)) %>%
# arrange(factor(GroupforExpansionLoad,levels = factorAo)) %>%
group_by(as.character(GroupforExpansionLoad)) %>%
group_map(~plotfun_map(.x,.y))
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 1062 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (8): Taxa, GenomeType, GroupforExpansionLoad, GroupforExpansionLoad2, or...
## dbl (2): Latitude, Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: Removed 2 rows containing missing values (geom_point).
### 图片保存
# q <- ggpubr::ggarrange(plotlist=df2,ncol = 4,nrow = 7)
#
# q
#
# cowplot::save_plot(q,filename = "~/Documents/test.pdf",base_height = 7.2,base_width = 7.2)
### 按照自己定义的顺序进行图片保存
q <- ggpubr::ggarrange(plotlist=list(df2[[24]],df2[[3]],df2[[4]],df2[[8]],df2[[18]],
df2[[19]],df2[[5]],df2[[7]],df2[[17]],
df2[[15]],df2[[20]],
df2[[9]],df2[[10]],df2[[11]],df2[[12]],df2[[13]],
df2[[14]],df2[[2]],df2[[16]],
df2[[1]],df2[[6]],df2[[22]],df2[[23]],df2[[25]],df2[[26]],
df2[[21]]
),
labels = as.character(c(1:26)),
font.label = list(size = 7),
ncol = 4,nrow = 7)
## Warning: Removed 2 rows containing missing values (geom_point).
cowplot::save_plot(q,filename = "~/Documents/test.pdf",base_height = 7.2,base_width = 7.2)
======
****DAF
plot by 642 nonsyn and syn
- 注意:该方法将 del 列为 nonsyn,不画出 del
### step1: df
df <- read_tsv("data/015_AAF/001_aaf_bySubspecies/001_aaf_bySubspecies.txt.gz",show_col_types = FALSE)
### step2: DAF bin
df1 <- df %>%
mutate(VariantsType = ifelse(LoadGroup=="Synonymous","Synonymous","Nonsynonymous")) %>%
select(-LoadGroup) %>%
rename(LoadGroup=VariantsType) %>%
rename(DAF=AAF) %>%
filter(DAF!=0) %>%
filter(DAF!=1) %>%
mutate(bin=cut_width(DAF,width = 0.1,boundary = 0)) %>%
group_by(GenomeType,Sub,LoadGroup,Group) %>%
count(bin) %>%
mutate(fre=n/sum(n))
df3 <- df %>%
mutate(VariantsType = ifelse(LoadGroup=="Synonymous","Synonymous","Nonsynonymous")) %>%
select(-LoadGroup) %>%
rename(LoadGroup=VariantsType) %>%
rename(DAF=AAF) %>%
filter(DAF!=0) %>%
filter(DAF!=1) %>%
mutate(bin=cut_width(DAF,width = 0.2,boundary = 0)) %>%
group_by(GenomeType,Sub,LoadGroup,Group) %>%
count(bin) %>%
mutate(fre=n/sum(n))
### step3: plot_function
fun_plotDAF <- function(df_fun,factor_fun){
### df_fun 提取因子信息
fun_genometype <- factor_fun$GenomeType
fun_sub <- factor_fun$Sub
fun_loadgroup <- factor_fun$LoadGroup
### 设置因子顺序,由于 设置的 0.1 bin 导致有些亚群没有颜色信息
factorA <- df3 %>%
filter(GenomeType == fun_genometype,
Sub == fun_sub,
LoadGroup == fun_loadgroup) %>%
filter(bin=="[0,0.2]") %>%
arrange(-fre) %>%
pull(Group)
### 为每个亚群添加倍性信息,dfhash 为 dfcolorDB 服务
dfhash <-
read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt",show_col_types = FALSE) %>%
select(GenomeType, GroupforExpansionLoad,GroupforExpansionLoad2) %>%
filter(!is.na(GroupforExpansionLoad)) %>%
distinct(., GroupforExpansionLoad, .keep_all = T)
### 每个亚群的颜色信息
dfcolorDB <- read_tsv("data/001_TaxaDB/003_VMap2_colorDB.txt",show_col_types = FALSE) %>%
filter(ColorCategory %in% c("GroupforExpansionLoad")) %>% droplevels() %>%
left_join(dfhash, by = c("Group" = "GroupforExpansionLoad2")) %>%
### 过滤数据,使之和输入的数据框 df_fun 保持一致
filter(GenomeType == fun_genometype) %>%
mutate(Label=factor(Label,levels = factorA)) %>%
arrange(as.factor(Label))
### 设置数据框和画图颜色的因子顺序,使都保持一致
colB <- dfcolorDB$ColorCode
df_fun$Group <- factor(df_fun$Group,levels = factorA)
q <- df_fun %>%
ggplot(aes(
x = bin,
y = fre,
group = Group,
color = Group
)) +
geom_line(alpha = 0.9) +
labs(y=str_c("Proportion of ",str_to_lower(fun_loadgroup)," SNPs"),
x="Derived allele frequency")+
# labs(y = "Proportion of nonsynonymous SNPs", x = "Derived allele frequency") +
scale_color_manual(values = colB) +
# scale_color_manual(values = colorRampPalette(rev(brewer.pal(n = 12, name = "Set3")))(16))+
# scale_fill_manual(values = colB)+
# scale_y_continuous(limits = c(0,1))+
scale_x_discrete(label = c("0.1", "", "0.3", "", "0.5", "", "0.7", "", "0.9", "")) +
# facet_grid(Sub~Group)+
theme(strip.background = element_blank()) +
# theme(axis.text.x = element_text(vjust = 1, hjust = 1, angle = 45))+
theme(
# plot.margin=unit(rep(1,4),'lines'),
panel.background = element_blank(),
panel.border = element_rect(
colour = "black",
fill = NA,
size = 0.7
),
legend.key = element_blank(),
legend.position = 'none',
text = element_text(size = 9)
)
q
# ggsave(q,filename = "~/Documents/temp.png",height = 6,width = 8)
fun_genometype <- factor_fun$GenomeType
fun_sub <- factor_fun$Sub
fun_loadgroup <- factor_fun$LoadGroup
filename_fun <- str_c("~/Documents/",fun_genometype,"_",fun_sub,"Sub","_",fun_loadgroup,".pdf")
ggsave(q,
# filename = "~/Documents/test.pdf",
filename = filename_fun,
height = 4.2,
width = 7) ### 刚好可以把所有图例展示出来
### 打出出图顺序
print(filename_fun)
return(q)
}
### step4: invoke the plot function
df2 <- df1 %>%
group_by(GenomeType,Sub,LoadGroup) %>%
group_map(~fun_plotDAF(.x,.y))
## [1] "~/Documents/AABB_ASub_Nonsynonymous.pdf"
## [1] "~/Documents/AABB_ASub_Synonymous.pdf"
## [1] "~/Documents/AABB_BSub_Nonsynonymous.pdf"
## [1] "~/Documents/AABB_BSub_Synonymous.pdf"
## [1] "~/Documents/AABBDD_ASub_Nonsynonymous.pdf"
## [1] "~/Documents/AABBDD_ASub_Synonymous.pdf"
## [1] "~/Documents/AABBDD_BSub_Nonsynonymous.pdf"
## [1] "~/Documents/AABBDD_BSub_Synonymous.pdf"
## [1] "~/Documents/AABBDD_DSub_Nonsynonymous.pdf"
## [1] "~/Documents/AABBDD_DSub_Synonymous.pdf"
## [1] "~/Documents/DD_DSub_Nonsynonymous.pdf"
## [1] "~/Documents/DD_DSub_Synonymous.pdf"
## step5: combine figs automatically or manually
### 先保存6倍体的,再保存4倍体的,最后2倍体
q <- ggpubr::ggarrange(plotlist=df2,ncol = 3)
q
## $`1`

##
## $`2`

##
## $`3`

##
## $`4`

##
## attr(,"class")
## [1] "list" "ggarrange"
### 六倍体
q <- ggpubr::ggarrange(plotlist=list(df2[[6]],df2[[8]],df2[[10]],df2[[5]],df2[[7]],df2[[9]]),
ncol = 3,nrow = 2)
q

cowplot::save_plot(q,filename = "~/Documents/AABBDD.pdf",base_height = 4.45,base_width = 7.2)
### 四倍体
q <- ggpubr::ggarrange(plotlist=list(df2[[2]],df2[[4]],
df2[[1]],df2[[3]]),
ncol = 2,nrow = 2)
q

cowplot::save_plot(q,filename = "~/Documents/AABB.pdf",base_height = 4.45,base_width = 7.2)
plot by 642 del nonsyn and syn
- 注意:该方法将 del 列为 nonsyn,不画出 del
### step1: df
df <- read_tsv("data/015_AAF/001_aaf_bySubspecies/001_aaf_bySubspecies.txt.gz",show_col_types = FALSE)
### step2: DAF bin
df1 <- df %>%
rename(DAF=AAF) %>%
filter(DAF!=0) %>%
filter(DAF!=1) %>%
mutate(bin=cut_width(DAF,width = 0.1,boundary = 0)) %>%
group_by(GenomeType,Sub,LoadGroup,Group) %>%
count(bin) %>%
mutate(fre=n/sum(n))
df3 <- df %>%
rename(DAF=AAF) %>%
filter(DAF!=0) %>%
filter(DAF!=1) %>%
mutate(bin=cut_width(DAF,width = 0.2,boundary = 0)) %>%
group_by(GenomeType,Sub,LoadGroup,Group) %>%
count(bin) %>%
mutate(fre=n/sum(n))
### step3: plot_function
fun_plotDAF <- function(df_fun,factor_fun){
### df_fun 提取因子信息
fun_genometype <- factor_fun$GenomeType
fun_sub <- factor_fun$Sub
fun_loadgroup <- factor_fun$LoadGroup
### 设置因子顺序,由于 设置的 0.1 bin 导致有些亚群没有颜色信息
factorA <- df3 %>%
filter(GenomeType == fun_genometype,
Sub == fun_sub,
LoadGroup == fun_loadgroup) %>%
filter(bin=="[0,0.2]") %>%
arrange(-fre) %>%
pull(Group)
### 为每个亚群添加倍性信息,dfhash 为 dfcolorDB 服务
dfhash <-
read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt",show_col_types = FALSE) %>%
select(GenomeType, GroupforExpansionLoad,GroupforExpansionLoad2) %>%
filter(!is.na(GroupforExpansionLoad)) %>%
distinct(., GroupforExpansionLoad, .keep_all = T)
### 每个亚群的颜色信息
dfcolorDB <- read_tsv("data/001_TaxaDB/003_VMap2_colorDB.txt",show_col_types = FALSE) %>%
filter(ColorCategory %in% c("GroupforExpansionLoad")) %>% droplevels() %>%
left_join(dfhash, by = c("Group" = "GroupforExpansionLoad2")) %>%
### 过滤数据,使之和输入的数据框 df_fun 保持一致
filter(GenomeType == fun_genometype) %>%
mutate(Label=factor(Label,levels = factorA)) %>%
arrange(as.factor(Label))
### 设置数据框和画图颜色的因子顺序,使都保持一致
colB <- dfcolorDB$ColorCode
df_fun$Group <- factor(df_fun$Group,levels = factorA)
q <- df_fun %>%
ggplot(aes(
x = bin,
y = fre,
group = Group,
color = Group
)) +
geom_line(alpha = 0.9) +
labs(y=str_c("Proportion of ",str_to_lower(fun_loadgroup)," SNPs"),
x="Derived allele frequency")+
# labs(y = "Proportion of nonsynonymous SNPs", x = "Derived allele frequency") +
scale_color_manual(values = colB) +
# scale_color_manual(values = colorRampPalette(rev(brewer.pal(n = 12, name = "Set3")))(16))+
# scale_fill_manual(values = colB)+
# scale_y_continuous(limits = c(0,1))+
scale_x_discrete(label = c("0.1", "", "0.3", "", "0.5", "", "0.7", "", "0.9", "")) +
# facet_grid(Sub~Group)+
theme(strip.background = element_blank()) +
# theme(axis.text.x = element_text(vjust = 1, hjust = 1, angle = 45))+
theme(
# plot.margin=unit(rep(1,4),'lines'),
panel.background = element_blank(),
panel.border = element_rect(
colour = "black",
fill = NA,
size = 0.7
),
legend.key = element_blank(),
legend.position = 'none',
text = element_text(size = 9)
)
q
# ggsave(q,filename = "~/Documents/temp.png",height = 6,width = 8)
fun_genometype <- factor_fun$GenomeType
fun_sub <- factor_fun$Sub
fun_loadgroup <- factor_fun$LoadGroup
filename_fun <- str_c("~/Documents/",fun_genometype,"_",fun_sub,"Sub","_",fun_loadgroup,".pdf")
ggsave(q,
# filename = "~/Documents/test.pdf",
filename = filename_fun,
height = 4.2,
width = 7) ### 刚好可以把所有图例展示出来
### 打出出图顺序
print(filename_fun)
return(q)
}
### step4: invoke the plot function
df2 <- df1 %>%
group_by(GenomeType,Sub,LoadGroup) %>%
group_map(~fun_plotDAF(.x,.y))
## [1] "~/Documents/AABB_ASub_Deleterious.pdf"
## [1] "~/Documents/AABB_ASub_Nonsynonymous.pdf"
## [1] "~/Documents/AABB_ASub_Synonymous.pdf"
## [1] "~/Documents/AABB_BSub_Deleterious.pdf"
## [1] "~/Documents/AABB_BSub_Nonsynonymous.pdf"
## [1] "~/Documents/AABB_BSub_Synonymous.pdf"
## [1] "~/Documents/AABBDD_ASub_Deleterious.pdf"
## [1] "~/Documents/AABBDD_ASub_Nonsynonymous.pdf"
## [1] "~/Documents/AABBDD_ASub_Synonymous.pdf"
## [1] "~/Documents/AABBDD_BSub_Deleterious.pdf"
## [1] "~/Documents/AABBDD_BSub_Nonsynonymous.pdf"
## [1] "~/Documents/AABBDD_BSub_Synonymous.pdf"
## [1] "~/Documents/AABBDD_DSub_Deleterious.pdf"
## [1] "~/Documents/AABBDD_DSub_Nonsynonymous.pdf"
## [1] "~/Documents/AABBDD_DSub_Synonymous.pdf"
## [1] "~/Documents/DD_DSub_Deleterious.pdf"
## [1] "~/Documents/DD_DSub_Nonsynonymous.pdf"
## [1] "~/Documents/DD_DSub_Synonymous.pdf"
## step5: combine figs automatically or manually
# ### 先保存6倍体的,再保存4倍体的,最后2倍体
# q <- ggpubr::ggarrange(plotlist=df2,ncol = 3)
#
# q
### 六倍体
q <- ggpubr::ggarrange(plotlist=list(df2[[9]],df2[[12]],df2[[15]],
df2[[8]],df2[[11]],df2[[14]],
df2[[7]],df2[[10]],df2[[13]]),
ncol = 3,nrow = 3)
q

cowplot::save_plot(q,filename = "~/Documents/AABBDD_syn_nonsyn_del.pdf",base_height = 7.2,base_width = 7.2)
### 四倍体
q <- ggpubr::ggarrange(plotlist=list(df2[[3]],df2[[6]],
df2[[2]],df2[[5]],
df2[[1]],df2[[4]]),
ncol = 2,nrow = 3)
q

cowplot::save_plot(q,filename = "~/Documents/AABB_syn_nonsyn_del.pdf",base_height = 7.2,base_width = 7.2)
======
****Heter by individual
factorAo <- c("Wild emmer","Domesticated emmer","Durum","Khorasan wheat","Polish wheat","Rivet wheat","Georgian wheat","Ispahanicum","Persian wheat","LR_EA","LR_CSA","LR_WA","LR_AF","LR_EU","LR_AM","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Vavilovii","Xinjiang wheat","Yunan wheat","Macha","Spelt","Strangulata")
### 该数据框是为了根据26亚群找到对应的基因组类型
dfhash <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
select(Taxa,GroupforExpansionLoad,Subspecies_by6_TreeValidated)
## Rows: 1062 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Taxa, GenomeType, GroupforExpansionLoad, GroupforExpansionLoad2, Su...
## dbl (2): Latitude, Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### 该数据框是画图 heter 时所用的数据
df1 <- read.delim("data/003_VCF_QC/AABBDD_taxa_QC.txt.gz")
df2 <- read.delim("data/003_VCF_QC/AABB_taxa_QC.txt.gz")
df3 <- read.delim("data/003_VCF_QC/DD_taxa_QC.txt.gz")
df <- rbind(df1,df2,df3) %>%
mutate(GenomeType = factor(GenomeType,levels = c("AABB","AABBDD","DD"))) %>%
left_join(dfhash,by = "Taxa") %>%
filter(!is.na(GroupforExpansionLoad))
### 统计个体按照 heter 的排序
dfV1 <- df %>%
mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,
levels = c( "Wild_emmer","Domesticated_emmer","Free_threshing_tetraploids",
"OtherTetraploids",
"Landrace","Cultivar","OtherHexaploids","Strangulata"))) %>%
### 下一步记得排序的时候加上大群分类
group_by(GenomeType,Subspecies_by6_TreeValidated,GroupforExpansionLoad) %>%
summarise(HeterozygousProportion = mean(HeterozygousProportion)) %>%
arrange(GenomeType,Subspecies_by6_TreeValidated,-HeterozygousProportion) %>%
distinct(GroupforExpansionLoad,.keep_all = F)
## `summarise()` has grouped output by 'GenomeType', 'Subspecies_by6_TreeValidated'. You can override using the `.groups` argument.
### 因子排序
df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)
### 计算每个亚群的个数,并添加到taxa数据库后面
dftaxaCount <- df %>%
group_by(GroupforExpansionLoad) %>%
count() %>%
ungroup() %>%
distinct(.,GroupforExpansionLoad,.keep_all = T) %>%
mutate(GroupforExpansionLoad_AddCount =
str_c(GroupforExpansionLoad," (", n, ")",sep = "")) %>%
select(-n)
yvalue <- 0.17
p <- ggplot(df,aes(x=GroupforExpansionLoad,y=HeterozygousProportion))+
ylab("Heterozygous genotype\nproportion by individual")+
xlab("")+
### 加阴影
geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5,color="white")+
geom_rect(aes(xmin=3-0.5,xmax=3+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=5-0.5,xmax=5+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=7-0.5,xmax=7+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=9-0.5,xmax=9+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=11-0.5,xmax=11+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=13-0.5,xmax=13+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=15-0.5,xmax=15+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=17-0.5,xmax=17+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=19-0.5,xmax=19+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=21-0.5,xmax=21+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=23-0.5,xmax=23+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=25-0.5,xmax=25+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
# 最上面的条纹
geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=yvalue,ymax=Inf),fill="#ffd702",alpha=0.5)+
geom_rect(aes(xmin=2-0.5,xmax=2+0.5,ymin=yvalue,ymax=Inf),fill="#7f5701",alpha=0.5)+
geom_rect(aes(xmin=3-0.5,xmax=6+0.5,ymin=yvalue,ymax=Inf),fill="#016699",alpha=0.5)+
geom_rect(aes(xmin=7-0.5,xmax=9+0.5,ymin=yvalue,ymax=Inf),fill="#00f3ff",alpha=0.5)+
geom_rect(aes(xmin=10-0.5,xmax=15+0.5,ymin=yvalue,ymax=Inf),fill="#fc6e6e",alpha=0.5)+
geom_rect(aes(xmin=16-0.5,xmax=16+0.5,ymin=yvalue,ymax=Inf),fill="#9900ff",alpha=0.5)+
geom_rect(aes(xmin=17-0.5,xmax=25+0.5,ymin=yvalue,ymax=Inf),fill="#fe63c2",alpha=0.5)+
geom_rect(aes(xmin=26-0.5,xmax=26+0.5,ymin=yvalue,ymax=Inf),fill="#87cef9",alpha=0.5)+
# 开始画图
# stat_boxplot(geom = "errorbar",width=0.15,position = position_dodge(1))+ # add error bar
# geom_point(position = position_jitterdodge(0.5),alpha = 0.6,aes(col=Sub),size=0.3)+
# stat_summary(fun.data=mean_sdl,geom="pointrange",size=0.3)+
### boxplot
geom_boxplot(position = position_dodge(0.7),
outlier.color = 'black',
outlier.alpha = 0.8,
outlier.size = 0.7,
alpha=0.8,width=0.7)+ #,width=0.0.7
stat_summary(fun=mean, geom="point", color="red",position = position_dodge(1),size=0.5)+
### boxplot and point
# geom_boxplot(position = position_dodge(0.9),outlier.color = 'white',alpha=0.5,outlier.size = 0.5)+
# geom_point(position = position_jitterdodge(0.6),alpha = 0.6,aes(color=GenomeType),size=0.8)+
geom_vline(xintercept = 9.5,linetype="dashed",color="black")+
geom_vline(xintercept = 25.5,linetype="dashed",color="black")+
scale_color_manual(values = c('#ffd702','#fc6e6e',"#87cef9"),name='')+
scale_x_discrete(labels = dftaxaCount$GroupforExpansionLoad_AddCount)+
theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
theme(plot.margin=unit(rep(1.3,4),'lines'))+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.5),
text = element_text(size = 10),legend.position = 'none')
p

# ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 4,width = 7.2)
ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 3.5,width = 7)
### 求四倍体 六倍体 二倍体的杂合度平均值
dfave <- df %>%
group_by(GenomeType) %>%
summarise(mean_heter=mean(HeterozygousProportion))
dfave
## # A tibble: 3 × 2
## GenomeType mean_heter
## <fct> <dbl>
## 1 AABB 0.0147
## 2 AABBDD 0.00654
## 3 DD 0.0169
# AABB 0.014694332
# AABBDD 0.006543548
# DD 0.016855904
======
****ANGSD
thetaW_ANGSD
### 对区间的 pi 进行总结,求平均值
# df <- read_tsv("data/014_PopParameters/003_angsd/001_angsd_subspecies26_geneRegion_RefChr.txt.gz")
#
# df1 <- df %>%
# group_by(RefChr,Group) %>%
# summarise(across(.cols = c(4:13),.fns = ~ mean(.x, na.rm = TRUE),.names = "{.col}_mean"))
#
### write_tsv(df1,file = "data/014_PopParameters/003_angsd/002_angsd_summary.txt.gz") ### 注意结果中含有0 NA
### 分组数据
dfhash <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
select(GroupforExpansionLoad,GroupforExpansionLoad2,GenomeType,Subspecies_by6_TreeValidated) %>%
distinct(GroupforExpansionLoad,.keep_all = T)
## Rows: 1062 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Taxa, GenomeType, GroupforExpansionLoad, GroupforExpansionLoad2, Su...
## dbl (2): Latitude, Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### 画图数据
df <- read_tsv("data/014_PopParameters/003_angsd/002_angsd_summary.txt.gz") %>%
filter(tW_mean!=0) %>%
mutate(Sub=str_sub(RefChr,2,2)) %>%
left_join(dfhash,by=c("Group"="GroupforExpansionLoad2"))
## Rows: 469 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): RefChr, Group
## dbl (10): tW_mean, tP_mean, tF_mean, tH_mean, tL_mean, Tajima_mean, fuf_mean...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### 统计每个亚基因组内,按照A亚基因组排序
dfV1 <- df %>%
mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,levels = c("Wild_emmer","Domesticated_emmer","Free_threshing_tetraploids","OtherTetraploids","Landrace","Cultivar","OtherHexaploids","Strangulata"))) %>%
group_by(GenomeType,Sub,Subspecies_by6_TreeValidated,GroupforExpansionLoad) %>%
summarise(mean = mean(tW_mean)) %>%
arrange(.,GenomeType,Sub,Subspecies_by6_TreeValidated,-mean) %>%
ungroup() %>%
filter(Sub=="A") %>%
select(GroupforExpansionLoad) %>%
add_row(GroupforExpansionLoad="Strangulata")
## `summarise()` has grouped output by 'GenomeType', 'Sub', 'Subspecies_by6_TreeValidated'. You can override using the `.groups` argument.
### 因子排序
### 计数数据
dfori <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
filter(!is.na(GroupforExpansionLoad))
## Rows: 1062 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Taxa, GenomeType, GroupforExpansionLoad, GroupforExpansionLoad2, Su...
## dbl (2): Latitude, Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dfori$GroupforExpansionLoad <- factor(dfori$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)
df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)
### 计算每个亚群的个数,并添加到taxa数据库后面
dftaxaCount <- dfori %>%
select(GroupforExpansionLoad) %>%
group_by(GroupforExpansionLoad) %>%
count() %>%
ungroup() %>%
mutate(GroupforExpansionLoad_AddCount = str_c(GroupforExpansionLoad," (", n, ")",sep = "")) %>%
select(-n)
yvalue <- 90
p <- ggplot(df,aes(x=GroupforExpansionLoad,y=tW_mean))+
labs(y= expression(italic(theta)[italic(W)]), x = "")+
### 加阴影
geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5,color="white")+
geom_rect(aes(xmin=3-0.5,xmax=3+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=5-0.5,xmax=5+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=7-0.5,xmax=7+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=9-0.5,xmax=9+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=11-0.5,xmax=11+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=13-0.5,xmax=13+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=15-0.5,xmax=15+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=17-0.5,xmax=17+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=19-0.5,xmax=19+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=21-0.5,xmax=21+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=23-0.5,xmax=23+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=25-0.5,xmax=25+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
# 最上面的条纹
geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=yvalue,ymax=Inf),fill="#ffd702",alpha=0.5)+
geom_rect(aes(xmin=2-0.5,xmax=2+0.5,ymin=yvalue,ymax=Inf),fill="#7f5701",alpha=0.5)+
geom_rect(aes(xmin=3-0.5,xmax=6+0.5,ymin=yvalue,ymax=Inf),fill="#016699",alpha=0.5)+
geom_rect(aes(xmin=7-0.5,xmax=9+0.5,ymin=yvalue,ymax=Inf),fill="#00f3ff",alpha=0.5)+
geom_rect(aes(xmin=10-0.5,xmax=15+0.5,ymin=yvalue,ymax=Inf),fill="#fc6e6e",alpha=0.5)+
geom_rect(aes(xmin=16-0.5,xmax=16+0.5,ymin=yvalue,ymax=Inf),fill="#9900ff",alpha=0.5)+
geom_rect(aes(xmin=17-0.5,xmax=25+0.5,ymin=yvalue,ymax=Inf),fill="#fe63c2",alpha=0.5)+
geom_rect(aes(xmin=26-0.5,xmax=26+0.5,ymin=yvalue,ymax=Inf),fill="#87cef9",alpha=0.5)+
# 开始画图
# stat_boxplot(geom = "errorbar",width=0.15,position = position_dodge(1))+ # add error bar
# geom_point(position = position_jitterdodge(0.5),alpha = 0.6,aes(col=Sub),size=0.3)+
# stat_summary(fun.data=mean_sdl,geom="pointrange",size=0.3)+
### boxplot
geom_boxplot(
aes(color=Sub),
position = position_dodge(0.8),
outlier.color = NA,
# outlier.alpha = 0.8,
# outlier.size = 0.7,
alpha=0.8,width=0.7)+ #,width=0.0.7
stat_summary(
aes(group=Sub),
color="black",
fun=mean, geom="point",position = position_dodge(0.8),size=0.4)+
### boxplot and point
# geom_boxplot(position = position_dodge(0.9),outlier.color = 'white',alpha=0.5,outlier.size = 0.5)+
# geom_point(position = position_jitterdodge(0.6),alpha = 0.6,aes(color=GenomeType),size=0.8)+
geom_vline(xintercept = 9.5,linetype="dashed",color="black")+
geom_vline(xintercept = 25.5,linetype="dashed",color="black")+
scale_color_manual(values = c("#fd8582","#967bce","#4bcdc6"),name='')+
scale_x_discrete(labels = dftaxaCount$GroupforExpansionLoad_AddCount)+
theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
theme(plot.margin=unit(rep(1.3,4),'lines'))+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.5),
# legend.position = 'none',
# legend.position = c(0.9,0.6),
text = element_text(size = 10))
p

# ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 4,width = 7.2)
ggsave(p,filename = "/Users/Aoyue/Documents/thetaW_ANGSD.pdf",height = 3.5,width = 7)
thetaP_ANGSD
dfhash <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
select(GroupforExpansionLoad,GroupforExpansionLoad2,GenomeType,Subspecies_by6_TreeValidated) %>%
distinct(GroupforExpansionLoad,.keep_all = T)
## Rows: 1062 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Taxa, GenomeType, GroupforExpansionLoad, GroupforExpansionLoad2, Su...
## dbl (2): Latitude, Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- read_tsv("data/014_PopParameters/003_angsd/002_angsd_summary.txt.gz") %>%
filter(tP_mean!=0) %>%
mutate(Sub=str_sub(RefChr,2,2)) %>%
left_join(dfhash,by=c("Group"="GroupforExpansionLoad2"))
## Rows: 469 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): RefChr, Group
## dbl (10): tW_mean, tP_mean, tF_mean, tH_mean, tL_mean, Tajima_mean, fuf_mean...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### 统计每个亚基因组内,按照A亚基因组排序
### dfV2 为排序后的亚洲列表
dfV1 <- df %>%
mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,levels = c("Wild_emmer","Domesticated_emmer","Free_threshing_tetraploids","OtherTetraploids","Landrace","Cultivar","OtherHexaploids","Strangulata"))) %>%
group_by(GenomeType,Sub,Subspecies_by6_TreeValidated,GroupforExpansionLoad) %>%
summarise(mean = mean(tP_mean)) %>%
arrange(.,GenomeType,Sub,Subspecies_by6_TreeValidated,-mean) %>%
ungroup() %>%
filter(Sub=="A") %>%
select(GroupforExpansionLoad) %>%
add_row(GroupforExpansionLoad="Strangulata")
## `summarise()` has grouped output by 'GenomeType', 'Sub', 'Subspecies_by6_TreeValidated'. You can override using the `.groups` argument.
### 因子排序
dfori <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
filter(!is.na(GroupforExpansionLoad))
## Rows: 1062 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Taxa, GenomeType, GroupforExpansionLoad, GroupforExpansionLoad2, Su...
## dbl (2): Latitude, Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dfori$GroupforExpansionLoad <- factor(dfori$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)
df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)
### 计算每个亚群的个数,并添加到taxa数据库后面
dftaxaCount <- dfori %>%
select(GroupforExpansionLoad) %>%
group_by(GroupforExpansionLoad) %>%
count() %>%
ungroup() %>%
mutate(GroupforExpansionLoad_AddCount = str_c(GroupforExpansionLoad," (", n, ")",sep = "")) %>%
select(-n)
yvalue <- 150
p <- ggplot(df,aes(x=GroupforExpansionLoad,y=tP_mean))+
labs(y= expression(italic(theta)[italic(pi)]), x = "")+
### 加阴影
geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5,color="white")+
geom_rect(aes(xmin=3-0.5,xmax=3+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=5-0.5,xmax=5+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=7-0.5,xmax=7+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=9-0.5,xmax=9+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=11-0.5,xmax=11+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=13-0.5,xmax=13+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=15-0.5,xmax=15+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=17-0.5,xmax=17+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=19-0.5,xmax=19+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=21-0.5,xmax=21+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=23-0.5,xmax=23+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=25-0.5,xmax=25+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
# 最上面的条纹
geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=yvalue,ymax=Inf),fill="#ffd702",alpha=0.5)+
geom_rect(aes(xmin=2-0.5,xmax=2+0.5,ymin=yvalue,ymax=Inf),fill="#7f5701",alpha=0.5)+
geom_rect(aes(xmin=3-0.5,xmax=6+0.5,ymin=yvalue,ymax=Inf),fill="#016699",alpha=0.5)+
geom_rect(aes(xmin=7-0.5,xmax=9+0.5,ymin=yvalue,ymax=Inf),fill="#00f3ff",alpha=0.5)+
geom_rect(aes(xmin=10-0.5,xmax=15+0.5,ymin=yvalue,ymax=Inf),fill="#fc6e6e",alpha=0.5)+
geom_rect(aes(xmin=16-0.5,xmax=16+0.5,ymin=yvalue,ymax=Inf),fill="#9900ff",alpha=0.5)+
geom_rect(aes(xmin=17-0.5,xmax=25+0.5,ymin=yvalue,ymax=Inf),fill="#fe63c2",alpha=0.5)+
geom_rect(aes(xmin=26-0.5,xmax=26+0.5,ymin=yvalue,ymax=Inf),fill="#87cef9",alpha=0.5)+
# 开始画图
# stat_boxplot(geom = "errorbar",width=0.15,position = position_dodge(1))+ # add error bar
# geom_point(position = position_jitterdodge(0.5),alpha = 0.6,aes(col=Sub),size=0.3)+
# stat_summary(fun.data=mean_sdl,geom="pointrange",size=0.3)+
### boxplot
geom_boxplot(
aes(color=Sub),
position = position_dodge(0.8),
outlier.color = NA,
# outlier.alpha = 0.8,
# outlier.size = 0.7,
alpha=0.8,width=0.7)+ #,width=0.0.7
stat_summary(
aes(group=Sub),
color="black",
fun=mean, geom="point",position = position_dodge(0.8),size=0.4)+
### boxplot and point
# geom_boxplot(position = position_dodge(0.9),outlier.color = 'white',alpha=0.5,outlier.size = 0.5)+
# geom_point(position = position_jitterdodge(0.6),alpha = 0.6,aes(color=GenomeType),size=0.8)+
geom_vline(xintercept = 9.5,linetype="dashed",color="black")+
geom_vline(xintercept = 25.5,linetype="dashed",color="black")+
scale_color_manual(values = c("#fd8582","#967bce","#4bcdc6"),name='')+
scale_x_discrete(labels = dftaxaCount$GroupforExpansionLoad_AddCount)+
theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
theme(plot.margin=unit(rep(1.3,4),'lines'))+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.5),
# legend.position = 'none',
# legend.position = c(0.9,0.6),
text = element_text(size = 10))
p

# ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 4,width = 7.2)
ggsave(p,filename = "/Users/Aoyue/Documents/thetaP_ANGSD.pdf",height = 3.5,width = 7)
TajimaD_ANGSD
dfhash <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
select(GroupforExpansionLoad,GroupforExpansionLoad2,GenomeType,Subspecies_by6_TreeValidated) %>%
distinct(GroupforExpansionLoad,.keep_all = T)
## Rows: 1062 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Taxa, GenomeType, GroupforExpansionLoad, GroupforExpansionLoad2, Su...
## dbl (2): Latitude, Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- read_tsv("data/014_PopParameters/003_angsd/002_angsd_summary.txt.gz") %>%
filter(tW_mean!=0) %>%
mutate(Sub=str_sub(RefChr,2,2)) %>%
left_join(dfhash,by=c("Group"="GroupforExpansionLoad2"))
## Rows: 469 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): RefChr, Group
## dbl (10): tW_mean, tP_mean, tF_mean, tH_mean, tL_mean, Tajima_mean, fuf_mean...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### 统计每个亚基因组内,按照A亚基因组排序
### dfV2 为排序后的亚洲列表
dfV1 <- df %>%
mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,levels = c("Wild_emmer","Domesticated_emmer","Free_threshing_tetraploids","OtherTetraploids","Landrace","Cultivar","OtherHexaploids","Strangulata"))) %>%
group_by(GenomeType,Sub,Subspecies_by6_TreeValidated,GroupforExpansionLoad) %>%
summarise(mean = mean(Tajima_mean)) %>%
arrange(.,GenomeType,Sub,Subspecies_by6_TreeValidated,-mean) %>%
ungroup() %>%
filter(Sub=="A") %>%
select(GroupforExpansionLoad) %>%
add_row(GroupforExpansionLoad="Strangulata")
## `summarise()` has grouped output by 'GenomeType', 'Sub', 'Subspecies_by6_TreeValidated'. You can override using the `.groups` argument.
### 因子排序
dfori <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
filter(!is.na(GroupforExpansionLoad))
## Rows: 1062 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Taxa, GenomeType, GroupforExpansionLoad, GroupforExpansionLoad2, Su...
## dbl (2): Latitude, Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dfori$GroupforExpansionLoad <- factor(dfori$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)
df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)
### 计算每个亚群的个数,并添加到taxa数据库后面
dftaxaCount <- dfori %>%
select(GroupforExpansionLoad) %>%
group_by(GroupforExpansionLoad) %>%
count() %>%
ungroup() %>%
mutate(GroupforExpansionLoad_AddCount = str_c(GroupforExpansionLoad," (", n, ")",sep = "")) %>%
select(-n)
yvalue <- 4.5
p <- ggplot(df,aes(x=GroupforExpansionLoad,y=Tajima_mean))+
labs(y= expression(paste("Tajima's ",italic(D))), x = "")+
### 加阴影
geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5,color="white")+
geom_rect(aes(xmin=3-0.5,xmax=3+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=5-0.5,xmax=5+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=7-0.5,xmax=7+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=9-0.5,xmax=9+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=11-0.5,xmax=11+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=13-0.5,xmax=13+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=15-0.5,xmax=15+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=17-0.5,xmax=17+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=19-0.5,xmax=19+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=21-0.5,xmax=21+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=23-0.5,xmax=23+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=25-0.5,xmax=25+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
# 最上面的条纹
geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=yvalue,ymax=Inf),fill="#ffd702",alpha=0.5)+
geom_rect(aes(xmin=2-0.5,xmax=2+0.5,ymin=yvalue,ymax=Inf),fill="#7f5701",alpha=0.5)+
geom_rect(aes(xmin=3-0.5,xmax=6+0.5,ymin=yvalue,ymax=Inf),fill="#016699",alpha=0.5)+
geom_rect(aes(xmin=7-0.5,xmax=9+0.5,ymin=yvalue,ymax=Inf),fill="#00f3ff",alpha=0.5)+
geom_rect(aes(xmin=10-0.5,xmax=15+0.5,ymin=yvalue,ymax=Inf),fill="#fc6e6e",alpha=0.5)+
geom_rect(aes(xmin=16-0.5,xmax=16+0.5,ymin=yvalue,ymax=Inf),fill="#9900ff",alpha=0.5)+
geom_rect(aes(xmin=17-0.5,xmax=25+0.5,ymin=yvalue,ymax=Inf),fill="#fe63c2",alpha=0.5)+
geom_rect(aes(xmin=26-0.5,xmax=26+0.5,ymin=yvalue,ymax=Inf),fill="#87cef9",alpha=0.5)+
# 开始画图
# stat_boxplot(geom = "errorbar",width=0.15,position = position_dodge(1))+ # add error bar
# geom_point(position = position_jitterdodge(0.5),alpha = 0.6,aes(col=Sub),size=0.3)+
# stat_summary(fun.data=mean_sdl,geom="pointrange",size=0.3)+
### boxplot
geom_boxplot(
aes(color=Sub),
position = position_dodge(0.8),
outlier.color = NA,
# outlier.alpha = 0.8,
# outlier.size = 0.7,
alpha=0.8,width=0.7)+ #,width=0.0.7
stat_summary(
aes(group=Sub),
color="black",
fun=mean, geom="point",position = position_dodge(0.8),size=0.4)+
### boxplot and point
# geom_boxplot(position = position_dodge(0.9),outlier.color = 'white',alpha=0.5,outlier.size = 0.5)+
# geom_point(position = position_jitterdodge(0.6),alpha = 0.6,aes(color=GenomeType),size=0.8)+
geom_vline(xintercept = 9.5,linetype="dashed",color="black")+
geom_vline(xintercept = 25.5,linetype="dashed",color="black")+
scale_color_manual(values = c("#fd8582","#967bce","#4bcdc6"),name='')+
scale_x_discrete(labels = dftaxaCount$GroupforExpansionLoad_AddCount)+
theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
theme(plot.margin=unit(rep(1.3,4),'lines'))+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.5),
# legend.position = 'none',
# legend.position = c(0.9,0.6),
text = element_text(size = 10))
p

# ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 4,width = 7.2)
ggsave(p,filename = "/Users/Aoyue/Documents/TajimaD_ANGSD.pdf",height = 3.5,width = 7)
======
****VCFtools
pi_VCFtools by individual
### 对区间的 pi 进行总结,求平均值
# df <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/RProject/R4VMap2FigsS1062/data/014_PopParameters/001_pi/001_pi.txt.gz")
#
# df1 <- df %>%
# group_by(CHROM,Group) %>%
# summarise(mean_pi = mean(PI))
#
# write_tsv(df1,file = "data/014_PopParameters/001_pi/002_pi_summary.txt.gz")
### 开始处理 pi 的结果
dfori <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
filter(!is.na(GroupforExpansionLoad))
## Rows: 1062 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Taxa, GenomeType, GroupforExpansionLoad, GroupforExpansionLoad2, Su...
## dbl (2): Latitude, Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dfhash <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
select(GroupforExpansionLoad,GroupforExpansionLoad2,GenomeType,Subspecies_by6_TreeValidated) %>%
distinct(GroupforExpansionLoad,.keep_all = T)
## Rows: 1062 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Taxa, GenomeType, GroupforExpansionLoad, GroupforExpansionLoad2, Su...
## dbl (2): Latitude, Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- read_tsv("data/014_PopParameters/001_pi/002_pi_summary.txt.gz") %>%
mutate(Sub=str_sub(CHROM,2,2)) %>%
filter(Group!="No1B1R", Group != "With1B1R") %>%
left_join(dfhash,by=c("Group"="GroupforExpansionLoad2"))
## Rows: 475 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): CHROM, Group
## dbl (1): mean_pi
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### 统计每个亚基因组内,按照B亚基因组排序
### dfV2 为排序后的亚洲列表
dfV1 <- df %>%
mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,
levels = c("Wild_emmer","Domesticated_emmer","Free_threshing_tetraploids","OtherTetraploids","Landrace","Cultivar","OtherHexaploids","Strangulata"))) %>%
group_by(GenomeType,Sub,Subspecies_by6_TreeValidated,GroupforExpansionLoad) %>%
summarise(mean = mean(mean_pi)) %>%
### 下一步记得排序的时候加上大群分类
arrange(.,GenomeType,Sub,Subspecies_by6_TreeValidated,-mean) %>%
ungroup() %>%
filter(Sub=="A") %>%
select(GroupforExpansionLoad) %>%
add_row(GroupforExpansionLoad="Strangulata")
## `summarise()` has grouped output by 'GenomeType', 'Sub', 'Subspecies_by6_TreeValidated'. You can override using the `.groups` argument.
### 因子排序
dfori$GroupforExpansionLoad <- factor(dfori$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)
df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)
### 计算每个亚群的个数,并添加到taxa数据库后面
dftaxaCount <- dfori %>%
select(GroupforExpansionLoad) %>%
group_by(GroupforExpansionLoad) %>%
count() %>%
ungroup() %>%
mutate(GroupforExpansionLoad_AddCount = str_c(GroupforExpansionLoad," (", n, ")",sep = "")) %>%
select(-n)
yvalue <- 2.6
p <- ggplot(df,aes(x=GroupforExpansionLoad,y=mean_pi*1000))+
labs(y = expression(italic(theta)[pi]~"("~10^-3~")"),x="")+
### 加阴影
geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5,color="white")+
geom_rect(aes(xmin=3-0.5,xmax=3+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=5-0.5,xmax=5+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=7-0.5,xmax=7+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=9-0.5,xmax=9+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=11-0.5,xmax=11+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=13-0.5,xmax=13+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=15-0.5,xmax=15+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=17-0.5,xmax=17+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=19-0.5,xmax=19+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=21-0.5,xmax=21+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=23-0.5,xmax=23+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=25-0.5,xmax=25+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
# 最上面的条纹
geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=yvalue,ymax=Inf),fill="#ffd702",alpha=0.5)+
geom_rect(aes(xmin=2-0.5,xmax=2+0.5,ymin=yvalue,ymax=Inf),fill="#7f5701",alpha=0.5)+
geom_rect(aes(xmin=3-0.5,xmax=6+0.5,ymin=yvalue,ymax=Inf),fill="#016699",alpha=0.5)+
geom_rect(aes(xmin=7-0.5,xmax=9+0.5,ymin=yvalue,ymax=Inf),fill="#00f3ff",alpha=0.5)+
geom_rect(aes(xmin=10-0.5,xmax=15+0.5,ymin=yvalue,ymax=Inf),fill="#fc6e6e",alpha=0.5)+
geom_rect(aes(xmin=16-0.5,xmax=16+0.5,ymin=yvalue,ymax=Inf),fill="#9900ff",alpha=0.5)+
geom_rect(aes(xmin=17-0.5,xmax=25+0.5,ymin=yvalue,ymax=Inf),fill="#fe63c2",alpha=0.5)+
geom_rect(aes(xmin=26-0.5,xmax=26+0.5,ymin=yvalue,ymax=Inf),fill="#87cef9",alpha=0.5)+
# 开始画图
# stat_boxplot(geom = "errorbar",width=0.15,position = position_dodge(1))+ # add error bar
# geom_point(position = position_jitterdodge(0.5),alpha = 0.6,aes(col=Sub),size=0.3)+
# stat_summary(fun.data=mean_sdl,geom="pointrange",size=0.3)+
### boxplot
geom_boxplot(
aes(color=Sub),
position = position_dodge(0.8),
outlier.color = NA,
# outlier.alpha = 0.8,
# outlier.size = 0.7,
alpha=0.8,width=0.7)+ #,width=0.0.7
stat_summary(
aes(group=Sub),
color="black",
fun=mean, geom="point",position = position_dodge(0.8),size=0.4)+
### boxplot and point
# geom_boxplot(position = position_dodge(0.9),outlier.color = 'white',alpha=0.5,outlier.size = 0.5)+
# geom_point(position = position_jitterdodge(0.6),alpha = 0.6,aes(color=GenomeType),size=0.8)+
geom_vline(xintercept = 9.5,linetype="dashed",color="black")+
geom_vline(xintercept = 25.5,linetype="dashed",color="black")+
scale_color_manual(values = c("#fd8582","#967bce","#4bcdc6"),name='')+
scale_x_discrete(labels = dftaxaCount$GroupforExpansionLoad_AddCount)+
theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
theme(plot.margin=unit(rep(1.3,4),'lines'))+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.5),
# legend.position = 'none',
# legend.position = c(0.9,0.6),
text = element_text(size = 10))
p

# ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 4,width = 7.2)
ggsave(p,filename = "/Users/Aoyue/Documents/pi_VCFtools.pdf",height = 3.5,width = 7)
pi_VCFtools by individual 竖版
### 对区间的 pi 进行总结,求平均值
# df <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/RProject/R4VMap2FigsS1062/data/014_PopParameters/001_pi/001_pi.txt.gz")
#
# df1 <- df %>%
# group_by(CHROM,Group) %>%
# summarise(mean_pi = mean(PI))
#
# write_tsv(df1,file = "data/014_PopParameters/001_pi/002_pi_summary.txt.gz")
### 开始处理 pi 的结果
dfori <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
filter(!is.na(GroupforExpansionLoad))
## Rows: 1062 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Taxa, GenomeType, GroupforExpansionLoad, GroupforExpansionLoad2, Su...
## dbl (2): Latitude, Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dfhash <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
select(GroupforExpansionLoad,GroupforExpansionLoad2,GenomeType,Subspecies_by6_TreeValidated) %>%
distinct(GroupforExpansionLoad,.keep_all = T)
## Rows: 1062 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Taxa, GenomeType, GroupforExpansionLoad, GroupforExpansionLoad2, Su...
## dbl (2): Latitude, Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- read_tsv("data/014_PopParameters/001_pi/002_pi_summary.txt.gz") %>%
mutate(Sub=str_sub(CHROM,2,2)) %>%
filter(Group!="No1B1R", Group != "With1B1R") %>%
left_join(dfhash,by=c("Group"="GroupforExpansionLoad2"))
## Rows: 475 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): CHROM, Group
## dbl (1): mean_pi
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### 统计每个亚基因组内,按照B亚基因组排序
### dfV2 为排序后的亚洲列表
dfV1 <- df %>%
mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,
levels = c("Wild_emmer","Domesticated_emmer","Free_threshing_tetraploids","OtherTetraploids","Landrace","Cultivar","OtherHexaploids","Strangulata"))) %>%
group_by(GenomeType,Sub,Subspecies_by6_TreeValidated,GroupforExpansionLoad) %>%
summarise(mean = mean(mean_pi)) %>%
### 下一步记得排序的时候加上大群分类
arrange(.,GenomeType,Sub,Subspecies_by6_TreeValidated,-mean) %>%
ungroup() %>%
filter(Sub=="A") %>%
select(GroupforExpansionLoad) %>%
add_row(GroupforExpansionLoad="Strangulata")
## `summarise()` has grouped output by 'GenomeType', 'Sub', 'Subspecies_by6_TreeValidated'. You can override using the `.groups` argument.
### 因子排序
dfori$GroupforExpansionLoad <- factor(dfori$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)
df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)
### 计算每个亚群的个数,并添加到taxa数据库后面
dftaxaCount <- dfori %>%
select(GroupforExpansionLoad) %>%
group_by(GroupforExpansionLoad) %>%
count() %>%
ungroup() %>%
mutate(GroupforExpansionLoad_AddCount = str_c(GroupforExpansionLoad," (", n, ")",sep = "")) %>%
select(-n)
yvalue <- 2.6
p <- ggplot(df,aes(x=GroupforExpansionLoad,y=mean_pi*1000))+
labs(y = expression(italic(theta)[pi]~"("~10^-3~")"),x="")+
### 加阴影
geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5,color="white")+
geom_rect(aes(xmin=3-0.5,xmax=3+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=5-0.5,xmax=5+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=7-0.5,xmax=7+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=9-0.5,xmax=9+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=11-0.5,xmax=11+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=13-0.5,xmax=13+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=15-0.5,xmax=15+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=17-0.5,xmax=17+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=19-0.5,xmax=19+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=21-0.5,xmax=21+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=23-0.5,xmax=23+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=25-0.5,xmax=25+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
# 最上面的条纹
geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=yvalue,ymax=Inf),fill="#ffd702",alpha=0.5)+
geom_rect(aes(xmin=2-0.5,xmax=2+0.5,ymin=yvalue,ymax=Inf),fill="#7f5701",alpha=0.5)+
geom_rect(aes(xmin=3-0.5,xmax=6+0.5,ymin=yvalue,ymax=Inf),fill="#016699",alpha=0.5)+
geom_rect(aes(xmin=7-0.5,xmax=9+0.5,ymin=yvalue,ymax=Inf),fill="#00f3ff",alpha=0.5)+
geom_rect(aes(xmin=10-0.5,xmax=15+0.5,ymin=yvalue,ymax=Inf),fill="#fc6e6e",alpha=0.5)+
geom_rect(aes(xmin=16-0.5,xmax=16+0.5,ymin=yvalue,ymax=Inf),fill="#9900ff",alpha=0.5)+
geom_rect(aes(xmin=17-0.5,xmax=25+0.5,ymin=yvalue,ymax=Inf),fill="#fe63c2",alpha=0.5)+
geom_rect(aes(xmin=26-0.5,xmax=26+0.5,ymin=yvalue,ymax=Inf),fill="#87cef9",alpha=0.5)+
# 开始画图
# stat_boxplot(geom = "errorbar",width=0.15,position = position_dodge(1))+ # add error bar
# geom_point(position = position_jitterdodge(0.5),alpha = 0.6,aes(col=Sub),size=0.3)+
# stat_summary(fun.data=mean_sdl,geom="pointrange",size=0.3)+
geom_hline(yintercept = 1,linetype="dashed",color="black")+
### boxplot
geom_boxplot(
aes(color=Sub),
position = position_dodge(0.8),
outlier.color = NA,
# outlier.alpha = 0.8,
# outlier.size = 0.7,
alpha=0.8,width=0.7)+ #,width=0.0.7
stat_summary(
aes(group=Sub),
color="black",
fun=mean, geom="point",position = position_dodge(0.8),size=0.4)+
### boxplot and point
# geom_boxplot(position = position_dodge(0.9),outlier.color = 'white',alpha=0.5,outlier.size = 0.5)+
# geom_point(position = position_jitterdodge(0.6),alpha = 0.6,aes(color=GenomeType),size=0.8)+
geom_vline(xintercept = 9.5,linetype="dashed",color="black")+
geom_vline(xintercept = 25.5,linetype="dashed",color="black")+
scale_color_manual(values = c("#fd8582","#967bce","#4bcdc6"),name='')+
scale_x_discrete(labels = dftaxaCount$GroupforExpansionLoad_AddCount)+
coord_flip()+
theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
theme(plot.margin=unit(rep(1.3,4),'lines'))+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.5),
legend.position = 'none',
# legend.position = c(0.9,0.6),
text = element_text(size = 10))
p

# ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 4,width = 7.2)
ggsave(p,filename = "/Users/Aoyue/Documents/pi_VCFtools_竖版.pdf",height = 7,width = 3.5)
TajiamaD_VCFtools
### 对区间的 pi 进行总结,求平均值
# df <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/RProject/R4VMap2FigsS1062/data/014_PopParameters/002_tajimaD/001_TajimaD.txt.gz")
#
# df1 <- df %>%
# group_by(CHROM,Group) %>%
# summarise(mean_tajimaD = mean(TajimaD,na.rm = T))
#
# write_tsv(df1,file = "data/014_PopParameters/002_tajimaD/002_TajimaD_summary.txt.gz")
### 开始处理 pi 的结果
dfori <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/Group/004_taxaDB_GroupforExpansionLoad.txt") %>%
filter(!is.na(GroupforExpansionLoad))
## Rows: 1062 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Taxa, GenomeType, GroupforExpansionLoad, GroupforExpansionLoad2, Su...
## dbl (2): Latitude, Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dfhash <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/Group/004_taxaDB_GroupforExpansionLoad.txt") %>%
select(GroupforExpansionLoad,GroupforExpansionLoad2,GenomeType,Subspecies_by6_TreeValidated) %>%
distinct(GroupforExpansionLoad,.keep_all = T)
## Rows: 1062 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Taxa, GenomeType, GroupforExpansionLoad, GroupforExpansionLoad2, Su...
## dbl (2): Latitude, Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- read_tsv("data/014_PopParameters/002_tajimaD/002_TajimaD_summary.txt.gz") %>%
mutate(Sub=str_sub(CHROM,2,2)) %>%
left_join(dfhash,by=c("Group"="GroupforExpansionLoad2"))
## Rows: 469 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): CHROM, Group
## dbl (1): mean_tajimaD
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### 统计每个亚基因组内,按照B亚基因组排序
dfV1 <- df %>%
mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,levels = c("Wild_emmer","Domesticated_emmer","Free_threshing_tetraploids","OtherTetraploids","Landrace","Cultivar","OtherHexaploids","Strangulata"))) %>%
group_by(Sub,GenomeType,Subspecies_by6_TreeValidated,GroupforExpansionLoad) %>%
summarise(mean = mean(mean_tajimaD)) %>%
arrange(.,GenomeType,Sub,Subspecies_by6_TreeValidated,-mean) %>%
ungroup() %>%
filter(Sub=="A") %>%
select(GroupforExpansionLoad) %>%
add_row(GroupforExpansionLoad="Strangulata")
## `summarise()` has grouped output by 'Sub', 'GenomeType', 'Subspecies_by6_TreeValidated'. You can override using the `.groups` argument.
### 因子排序
dfori$GroupforExpansionLoad <- factor(dfori$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)
df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)
### 计算每个亚群的个数,并添加到taxa数据库后面
dftaxaCount <- dfori %>%
select(GroupforExpansionLoad) %>%
group_by(GroupforExpansionLoad) %>%
count() %>%
ungroup() %>%
mutate(GroupforExpansionLoad_AddCount = str_c(GroupforExpansionLoad," (", n, ")",sep = "")) %>%
select(-n)
yvalue <- 1.6
p <- ggplot(df,aes(x=GroupforExpansionLoad,y=mean_tajimaD))+
labs(y= expression(paste("Tajima's ",italic(D))), x = "")+
### 加阴影
geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5,color="white")+
geom_rect(aes(xmin=3-0.5,xmax=3+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=5-0.5,xmax=5+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=7-0.5,xmax=7+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=9-0.5,xmax=9+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=11-0.5,xmax=11+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=13-0.5,xmax=13+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=15-0.5,xmax=15+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=17-0.5,xmax=17+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=19-0.5,xmax=19+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=21-0.5,xmax=21+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=23-0.5,xmax=23+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
geom_rect(aes(xmin=25-0.5,xmax=25+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
# 最上面的条纹
geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=yvalue,ymax=Inf),fill="#ffd702",alpha=0.5)+
geom_rect(aes(xmin=2-0.5,xmax=2+0.5,ymin=yvalue,ymax=Inf),fill="#7f5701",alpha=0.5)+
geom_rect(aes(xmin=3-0.5,xmax=6+0.5,ymin=yvalue,ymax=Inf),fill="#016699",alpha=0.5)+
geom_rect(aes(xmin=7-0.5,xmax=9+0.5,ymin=yvalue,ymax=Inf),fill="#00f3ff",alpha=0.5)+
geom_rect(aes(xmin=10-0.5,xmax=15+0.5,ymin=yvalue,ymax=Inf),fill="#fc6e6e",alpha=0.5)+
geom_rect(aes(xmin=16-0.5,xmax=16+0.5,ymin=yvalue,ymax=Inf),fill="#9900ff",alpha=0.5)+
geom_rect(aes(xmin=17-0.5,xmax=25+0.5,ymin=yvalue,ymax=Inf),fill="#fe63c2",alpha=0.5)+
geom_rect(aes(xmin=26-0.5,xmax=26+0.5,ymin=yvalue,ymax=Inf),fill="#87cef9",alpha=0.5)+
# 开始画图
# stat_boxplot(geom = "errorbar",width=0.15,position = position_dodge(1))+ # add error bar
# geom_point(position = position_jitterdodge(0.5),alpha = 0.6,aes(col=Sub),size=0.3)+
# stat_summary(fun.data=mean_sdl,geom="pointrange",size=0.3)+
### boxplot
geom_boxplot(
aes(color=Sub),
position = position_dodge(0.8),
outlier.color = NA,
# outlier.alpha = 0.8,
# outlier.size = 0.7,
alpha=0.8,width=0.7)+ #,width=0.0.7
stat_summary(
aes(group=Sub),
color="black",
fun=mean, geom="point",position = position_dodge(0.8),size=0.4)+
### boxplot and point
# geom_boxplot(position = position_dodge(0.9),outlier.color = 'white',alpha=0.5,outlier.size = 0.5)+
# geom_point(position = position_jitterdodge(0.6),alpha = 0.6,aes(color=GenomeType),size=0.8)+
geom_vline(xintercept = 9.5,linetype="dashed",color="black")+
geom_vline(xintercept = 25.5,linetype="dashed",color="black")+
scale_color_manual(values = c("#fd8582","#967bce","#4bcdc6"),name='')+
scale_x_discrete(labels = dftaxaCount$GroupforExpansionLoad_AddCount)+
theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
theme(plot.margin=unit(rep(1.3,4),'lines'))+
theme(
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.5),
# legend.position = 'none',
# legend.position = c(0.9,0.6),
text = element_text(size = 10))
p

# ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 4,width = 7.2)
ggsave(p,filename = "/Users/Aoyue/Documents/TajimaD_VCFtools.pdf",height = 3.5,width = 7)
======
PCA
Hexaploid by subspecise by founder effect
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoMineR)
df_colDB <- read_tsv("data/001_TaxaDB/003_VMap2_colorDB.txt") %>%
filter(ColorCategory=="GroupforExpansionLoad") %>%
select(Label,ColorCode)
## Rows: 74 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): Group, Label, ColorCode, ColorCategory
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dftaxaDB <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
filter(!is.na(GroupforExpansionLoad)) %>%
select(Taxa,GenomeType,GroupforExpansionLoad,GroupforExpansionLoad2,Subspecies_by6_TreeValidated) %>%
left_join(df_colDB,by=c("GroupforExpansionLoad"="Label"))
## Rows: 1062 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Taxa, GenomeType, GroupforExpansionLoad, GroupforExpansionLoad2, Su...
## dbl (2): Latitude, Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_factor <- dftaxaDB %>%
distinct(GroupforExpansionLoad,.keep_all = T) %>%
filter(GenomeType=="AABBDD") %>%
mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,
levels = c("Landrace","Cultivar","OtherHexaploids"))) %>%
arrange(as.factor(Subspecies_by6_TreeValidated),GroupforExpansionLoad) %>%
mutate(ColorCode=if_else(Subspecies_by6_TreeValidated %in% c("Landrace","Cultivar") | GroupforExpansionLoad=="OtherHexaploids" ,"gray90",ColorCode)) %>%
mutate(ColorCode=if_else(GroupforExpansionLoad=="Club wheat","black",ColorCode))
df <- read_tsv("data/016_mds/hexaploid_mds.txt") %>%
left_join(dftaxaDB,by="Taxa") %>%
filter(!is.na(GroupforExpansionLoad)) %>%
mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,
levels = c("Landrace","Cultivar","OtherHexaploids"))) %>%
arrange(as.factor(Subspecies_by6_TreeValidated),GroupforExpansionLoad)
## Rows: 814 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Taxa
## dbl (5): PC1, PC2, PC3, PC4, PC5
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### 设置因子顺序及颜色
df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = df_factor$GroupforExpansionLoad)
colB <- df_factor$ColorCode
df_eig <- read_tsv("data/016_mds/hexaploid_mds_eigenvalues.txt")
## Rows: 5 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## dbl (2): PC, eigenvalue
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### 提取变异解释度并合并数据
variance1 <- paste(round(df_eig[1,2],3),"%",sep="") ##第一个成分
variance2 <- paste(round(df_eig[2,2],3),"%",sep="") ##第二个成分
### start to plot
p <- df %>%
ggplot(aes(x=PC1,y=PC2,fill= GroupforExpansionLoad))+
# geom_point(aes(color=Subspecies),alpha=0.5,size=2)+
geom_point(alpha=0.8,size=2,color="gray",shape=21)+
geom_hline(yintercept=0, linetype="dashed", color = "black")+
geom_vline(xintercept = 0, linetype="dashed", color = "black")+
# scale_shape_manual(values=c(23, 22,21,24,25), name="Continent")+
# scale_shape_manual(values=c(15, 16,17,18,19), name="Continent")+
# scale_color_manual(values = colB,name="Genetic group")+
scale_fill_manual(values = colB,name="Hexaploid by subspecies")+
# scale_shape_manual(values = shapeB)+
labs(x=paste("PC1 ","(",variance1,")",sep = ""), y=paste("PC2 ","(",variance2,")",sep = ""))+
theme_classic()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()
,panel.background = element_blank()
,axis.line = element_line(size=0.3, colour = "black")
# ,legend.position = 'none'
,text = element_text(size = 12))
p

ggsave(p,filename = "~/Documents/p4.pdf",height = 3,width = 3)
ggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 6)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
fig <- df %>%
# filter(Subspecies %in% c("Landrace_Yunnan","yunna-nense")) %>%
# filter(Subspecies == "Spelt_CAU" |Subspecies == "spelta") %>%
# filter(Subspecies %in% c("petropavlovskyi","Landrace_Xinjiang")) %>%
# filter(Subspecies == "Landrace",Group %in% c("JiaoLab","LuLab","CAAS") ) %>%
plot_ly(x = ~ PC1, y = ~ PC2,color = ~ GroupforExpansionLoad,
text = ~paste(Taxa),
colors = colB)
fig
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
Hexaploid by 26pops 只加 landrace
library(factoextra)
library(FactoMineR)
df_colDB <- read_tsv("data/001_TaxaDB/003_VMap2_colorDB.txt") %>%
filter(ColorCategory=="GroupforExpansionLoad") %>%
select(Label,ColorCode)
## Rows: 74 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): Group, Label, ColorCode, ColorCategory
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dftaxaDB <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
filter(!is.na(GroupforExpansionLoad)) %>%
select(Taxa,GenomeType,GroupforExpansionLoad,GroupforExpansionLoad2,Subspecies_by6_TreeValidated) %>%
left_join(df_colDB,by=c("GroupforExpansionLoad"="Label"))
## Rows: 1062 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Taxa, GenomeType, GroupforExpansionLoad, GroupforExpansionLoad2, Su...
## dbl (2): Latitude, Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_factor <- dftaxaDB %>%
distinct(GroupforExpansionLoad,.keep_all = T) %>%
filter(GenomeType=="AABBDD") %>%
mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,
levels = c("Landrace","Cultivar","OtherHexaploids"))) %>%
arrange(as.factor(Subspecies_by6_TreeValidated),GroupforExpansionLoad) %>%
mutate(ColorCode=if_else(Subspecies_by6_TreeValidated %in% c("Landrace","Cultivar") ,ColorCode ,"gray90"))
df <- read_tsv("data/016_mds/hexaploid_mds.txt") %>%
left_join(dftaxaDB,by="Taxa") %>%
filter(!is.na(GroupforExpansionLoad)) %>%
mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,
levels = c("OtherHexaploids","Cultivar","Landrace"))) %>%
arrange(as.factor(Subspecies_by6_TreeValidated),GroupforExpansionLoad)
## Rows: 814 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Taxa
## dbl (5): PC1, PC2, PC3, PC4, PC5
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### 设置因子顺序及颜色
df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = df_factor$GroupforExpansionLoad)
colB <- df_factor$ColorCode
df_eig <- read_tsv("data/016_mds/hexaploid_mds_eigenvalues.txt")
## Rows: 5 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## dbl (2): PC, eigenvalue
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### 提取变异解释度并合并数据
variance1 <- paste(round(df_eig[1,2],3),"%",sep="") ##第一个成分
variance2 <- paste(round(df_eig[2,2],3),"%",sep="") ##第二个成分
### start to plot
p <- df %>%
ggplot(aes(x=PC1,y=PC2,fill= GroupforExpansionLoad))+
# geom_point(aes(color=Subspecies),alpha=0.5,size=2)+
geom_point(alpha=0.8,size=2,color="gray",shape=21)+
geom_hline(yintercept=0, linetype="dashed", color = "black")+
geom_vline(xintercept = 0, linetype="dashed", color = "black")+
# scale_shape_manual(values=c(23, 22,21,24,25), name="Continent")+
# scale_shape_manual(values=c(15, 16,17,18,19), name="Continent")+
# scale_color_manual(values = colB,name="Genetic group")+
scale_fill_manual(values = colB,name="Hexaploid by subspecies")+
# scale_shape_manual(values = shapeB)+
labs(x=paste("PC1 ","(",variance1,")",sep = ""), y=paste("PC2 ","(",variance2,")",sep = ""))+
theme_classic()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()
,panel.background = element_blank()
,axis.line = element_line(size=0.3, colour = "black")
# ,legend.position = 'none'
,text = element_text(size = 12))
p

ggsave(p,filename = "~/Documents/p4.pdf",height = 3,width = 3)
ggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 6)
library(plotly)
fig <- df %>%
# filter(Subspecies %in% c("Landrace_Yunnan","yunna-nense")) %>%
# filter(Subspecies == "Spelt_CAU" |Subspecies == "spelta") %>%
# filter(Subspecies %in% c("petropavlovskyi","Landrace_Xinjiang")) %>%
# filter(Subspecies == "Landrace",Group %in% c("JiaoLab","LuLab","CAAS") ) %>%
plot_ly(x = ~ PC1, y = ~ PC2,color = ~ GroupforExpansionLoad,
text = ~paste(Taxa),
colors = colB)
fig
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
macha
A subgenome
library(factoextra)
library(FactoMineR)
### 导入种质数据库,并选取特定的列
dataFile <- c("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt")
dftaxaDB <- read.delim(dataFile) %>%
select(Taxa=VcfID,GenomeType,Subspecies_by22, Subspecies_by22_TreeValidated,Subspecies_by6_TreeValidated,Continent_forTree,Group,GroupByOriPublication_Abbre)
### import matrix
dataFile <- c("data/002_pca/004_matrix_Asub.txt.gz")
df <- read.delim(dataFile)
### 建立Taxa对应的属性变量数据框
dfTaxaInfo <- df %>% select(Taxa) %>% left_join(.,dftaxaDB,by="Taxa")
### pca analysis
c1 <- which(colnames(df) == 'Taxa')
df.pca <- PCA(df[,-c(c1)], graph = F)
### 提取变异解释度并合并数据
variance1 <- paste(round(df.pca$eig[1,3],2),"%",sep="") ##第一个成分
variance2 <- paste(round(df.pca$eig[2,3] - df.pca$eig[1,3],2),"%",sep="") ##第二个成分
dfpca <- as.data.frame(df.pca$ind$coord)
df <- cbind(dfTaxaInfo,dfpca)
factorA <- c("Wild_emmer","Domesticated_emmer","Free_threshing_tetraploids","OtherTetraploids", "Landrace","Cultivar","OtherHexaploids")
labelsA <- c("Wild emmer","Domesticated emmer","Free-threshing tetraploids","OtherTetraploids", "Landrace","Cultivar","OtherHexaploids")
colB <- c("#ffd702","#7f5701","#016699","#00f3ff", "#fc6e6e","#9900ff","#fe63c2")
df$Subspecies_by6_TreeValidated <- factor(df$Subspecies_by6_TreeValidated,levels=factorA,labels=labelsA)
### start to plot
p <- df %>%
# filter(! Taxa %in% c("AB_183","AB_185","AB_039")) %>%
ggplot(aes(x=Dim.1,y=Dim.2,fill= Subspecies_by22_TreeValidated))+
# geom_point(aes(color=Subspecies),alpha=0.5,size=2)+
geom_point(alpha=0.8,size=2,color="gray",shape=21)+
geom_hline(yintercept=0, linetype="dashed", color = "black")+
geom_vline(xintercept = 0, linetype="dashed", color = "black")+
# scale_shape_manual(values=c(23, 22,21,24,25), name="Continent")+
# scale_shape_manual(values=c(15, 16,17,18,19), name="Continent")+
# scale_color_manual(values = colB,name="Genetic group")+
# scale_fill_manual(values = colB,name="Genetic group")+
# scale_shape_manual(values = shapeB)+
labs(x=paste("PC1 ","(",variance1,")",sep = ""), y=paste("PC2 ","(",variance2,")",sep = ""))+
theme_classic()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()
,panel.background = element_blank()
,axis.line = element_line(size=0.3, colour = "black")
,legend.position = 'none'
,text = element_text(size = 12))
p

ggsave(p,filename = "~/Documents/p4.pdf",height = 3,width = 3)
library(plotly)
fig <- df %>%
mutate(Subspecies_by22_TreeValidated=if_else(Subspecies_by6_TreeValidated=="Free-threshing tetraploids","OtherTetraploids",Subspecies_by22_TreeValidated)) %>%
plot_ly(x = ~ Dim.1, y = ~ Dim.2,color = ~ Subspecies_by22_TreeValidated,
text = ~paste(Taxa),
colors = colB)
fig
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
macha 聚类查看
library(factoextra)
library(FactoMineR)
df_colDB <- read_tsv("data/001_TaxaDB/003_VMap2_colorDB.txt") %>%
filter(ColorCategory=="GroupforExpansionLoad") %>%
select(Label,ColorCode)
## Rows: 74 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): Group, Label, ColorCode, ColorCategory
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dftaxaDB <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
filter(!is.na(GroupforExpansionLoad)) %>%
select(Taxa,GenomeType,GroupforExpansionLoad,GroupforExpansionLoad2,Subspecies_by6_TreeValidated) %>%
left_join(df_colDB,by=c("GroupforExpansionLoad"="Label"))
## Rows: 1062 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Taxa, GenomeType, GroupforExpansionLoad, GroupforExpansionLoad2, Su...
## dbl (2): Latitude, Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_factor <- dftaxaDB %>%
distinct(GroupforExpansionLoad,.keep_all = T) %>%
filter(GenomeType=="AABBDD") %>%
mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,
levels = c("Landrace","Cultivar","OtherHexaploids"))) %>%
arrange(as.factor(Subspecies_by6_TreeValidated),GroupforExpansionLoad) %>%
mutate(ColorCode=if_else(Subspecies_by6_TreeValidated %in% c("Landrace","Cultivar") | GroupforExpansionLoad=="OtherHexaploids" ,"gray90",ColorCode)) %>%
mutate(ColorCode=if_else(GroupforExpansionLoad=="Club wheat","black",ColorCode))
df <- read_tsv("data/016_mds/hexaploid_mds.txt") %>%
left_join(dftaxaDB,by="Taxa") %>%
filter(!is.na(GroupforExpansionLoad)) %>%
mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,
levels = c("Landrace","Cultivar","OtherHexaploids"))) %>%
arrange(as.factor(Subspecies_by6_TreeValidated),GroupforExpansionLoad)
## Rows: 814 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Taxa
## dbl (5): PC1, PC2, PC3, PC4, PC5
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### 设置因子顺序及颜色
df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = df_factor$GroupforExpansionLoad)
colB <- df_factor$ColorCode
df_eig <- read_tsv("data/016_mds/hexaploid_mds_eigenvalues.txt")
## Rows: 5 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## dbl (2): PC, eigenvalue
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### 提取变异解释度并合并数据
variance1 <- paste(round(df_eig[1,2],3),"%",sep="") ##第一个成分
variance2 <- paste(round(df_eig[2,2],3),"%",sep="") ##第二个成分
### start to plot
p <- df %>%
ggplot(aes(x=PC1,y=PC2,fill= GroupforExpansionLoad))+
# geom_point(aes(color=Subspecies),alpha=0.5,size=2)+
geom_point(alpha=0.8,size=2,color="gray",shape=21)+
geom_hline(yintercept=0, linetype="dashed", color = "black")+
geom_vline(xintercept = 0, linetype="dashed", color = "black")+
# scale_shape_manual(values=c(23, 22,21,24,25), name="Continent")+
# scale_shape_manual(values=c(15, 16,17,18,19), name="Continent")+
# scale_color_manual(values = colB,name="Genetic group")+
scale_fill_manual(values = colB,name="Hexaploid by subspecies")+
# scale_shape_manual(values = shapeB)+
labs(x=paste("PC1 ","(",variance1,")",sep = ""), y=paste("PC2 ","(",variance2,")",sep = ""))+
theme_classic()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()
,panel.background = element_blank()
,axis.line = element_line(size=0.3, colour = "black")
# ,legend.position = 'none'
,text = element_text(size = 12))
p

ggsave(p,filename = "~/Documents/p4.pdf",height = 3,width = 3)
ggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 6)
library(plotly)
fig <- df %>%
# filter(Subspecies %in% c("Landrace_Yunnan","yunna-nense")) %>%
# filter(Subspecies == "Spelt_CAU" |Subspecies == "spelta") %>%
# filter(Subspecies %in% c("petropavlovskyi","Landrace_Xinjiang")) %>%
# filter(Subspecies == "Landrace",Group %in% c("JiaoLab","LuLab","CAAS") ) %>%
plot_ly(x = ~ PC1, y = ~ PC2,color = ~ GroupforExpansionLoad,
text = ~paste(Taxa),
colors = colB)
fig
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
====== Load Map and boxplot
plot Bubble map vivid 主题
choiceA <- "Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5"
### 先处理del文件
dfdel <- read_tsv("data/004_delCount/delRatio.txt.gz")
## Rows: 371456 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): Taxa, Subgenome, IfSelected, Group_refobj, DelCountGroup, Stand_Loa...
## dbl (1): Stand_delCount
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
select(Taxa=VcfID,GenomeType,Longitude,Latitude)
## Rows: 1062 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (33): VcfID, Group, Taxa, Accessions_byLuLab, GenomeType, Genus, Species...
## dbl (5): Longitude, Latitude, Altitude, ReleasedIntroducedYear, IfOnVmap2_S643
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# dftaxaDB <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
# select(Taxa,GenomeType,Longitude,Latitude,GroupforExpansionLoad,Subspecies_by6_TreeValidated)
df <- dfdel %>%
left_join(dftaxaDB,by="Taxa") %>%
filter(GenomeType=="AABBDD") %>%
filter(Subgenome=="D") %>%
filter(DelCountGroup == "TotalDerivedSNPCount") %>%
filter(Stand_LoadGroup == choiceA) %>% ### choice
select(-c("GenomeType","Subgenome","DelCountGroup","Stand_LoadGroup")) %>% filter(!is.na(Longitude)) %>% ### 802个个体 -> 543 个有经纬度信息
filter(!Taxa=="ABD_0047") %>%
arrange(Stand_delCount) #### 排序
mp<-NULL #定义一个空的地图
# 注意: borders函数中的colour是地图国家边界线的颜色,fill是国家的填充颜色
mapworld<-borders("world",colour = "#dedfe0",fill="#dedfe0") #绘制基本地图
# mapworld<-borders("world",colour = "#dedfe0",fill="gray") #绘制基本地图
mpvoid<-ggplot()+ mapworld + ylim(-55,90)+theme_void()
# mpvoid<-ggplot()+ mapworld + ylim(0,70)+xlim(-20,147)+theme_void()
library(viridis)
mp<-mpvoid +
geom_point(data = df,mapping = aes(x=Longitude,y=Latitude,
size=Stand_delCount, color=Stand_delCount),
alpha = 0.6)+
scale_size_continuous(range=c(0.35,1.2))+ ## 0.5-3
# scale_color_manual(values = colorRampPalette(brewer.pal(n = 3, name = "RdBu"))(100))+
# scale_fill_manual(values = colorRampPalette(brewer.pal(n = 3, name = "RdBu"))(100))
# scale_fill_gradientn(colours =rev(colorRampPalette(brewer.pal(n = 3, name = "RdBu"))(100)))+
# scale_color_gradientn(colours = rev(colorRampPalette(brewer.pal(n = 3, name = "RdBu"))(100)))
scale_color_viridis(trans="log") +
# scale_fill_viridis(trans="log")+
# scale_color_gradient2(low = "blue", high = "red", mid = "white",
# midpoint = 0.082, limit = c(0.07,0.0865),name="test") +
# theme(legend.position = 'none')
mp
ggsave(mp,filename = "figs/map_s1062_2.pdf", height = 2.76,width = 6.89)
plot Bubble map 循环画图
- 附件将 nonsyn pph2 gerp1.5 gerp2.14 gerp3.1 vep 画出来
### plot function
fun_plotMapLoad <- function(df_fun,factor_fun){
# factor_fun <- str_sub(factor_fun,11)
mp <- NULL #定义一个空的地图
mapworld <- borders("world", colour = "#dedfe0", fill = "#dedfe0")
mpvoid <- ggplot() + mapworld + ylim(-55, 90) + theme_void()
library(viridis)
mp <- mpvoid +
geom_point(
data = df_fun,
mapping = aes(
x = Longitude,
y = Latitude,
size = Stand_delCount,
color = Stand_delCount
# fill = Stand_delCount
),
alpha = 0.8
) +
labs(title = factor_fun)+
theme(plot.title = element_text(hjust = 0.5))+
scale_size_continuous(range = c(1, 3)) + ## 0.5-3
# scale_color_manual(values = colorRampPalette(brewer.pal(n = 3, name = "RdBu"))(100))
# scale_fill_manual(values = colorRampPalette(brewer.pal(n = 3, name = "RdBu"))(100))
scale_color_viridis(trans = "log")+
# scale_fill_viridis(trans = "log")+
guides(color = guide_colorbar(order = 1,title = "Mutation load"),
size = "none"
# size = guide_legend(order = 2,reverse = T)
)
mp
filename_fun <- str_c("~/Documents/","map_",factor_fun,".pdf")
print(filename_fun)
ggsave(mp,
filename = filename_fun,
height = 2.76,
width = 6.89)
return(mp)
}
dfdel <- read_tsv("data/004_delCount/delRatio.txt.gz")
## Rows: 371456 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): Taxa, Subgenome, IfSelected, Group_refobj, DelCountGroup, Stand_Loa...
## dbl (1): Stand_delCount
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dftaxaDB <- read.delim("/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/020_WheatVMap2_GermplasmInfo.txt") %>% select(Taxa=VcfID,GenomeType,Longitude,Latitude)
factorA <- c("Ratio_002_nonsynonymous","Ratio_033_Derived_PolyPhen2_0.5","Ratio_026_Derived_PolyPhen2_probably","Ratio_032_GERP16way_1.5","Ratio_014_GERP16way_2.14_max","Ratio_025_GERP16way_3.1","Ratio_007_VEP")
factorA_label <- c("Nonsynonymous","PPH2","PPH2_Probably","GERP>1.5","GERP>2.14","GERP>3.1","VEP_HighImpact")
df <- dfdel %>%
right_join(dftaxaDB,by="Taxa") %>%
filter(GenomeType=="AABBDD") %>%
filter(Subgenome=="D") %>%
filter(DelCountGroup == "TotalDerivedSNPCount") %>%
# filter(Stand_LoadGroup == choiceA) %>% ### choice
filter(Stand_LoadGroup %in% factorA) %>%
mutate(Stand_LoadGroup=factor(Stand_LoadGroup,levels = factorA,labels = factorA_label)) %>%
mutate(Stand_LoadGroup=as.character(Stand_LoadGroup)) %>%
filter(Stand_delCount!= 0) %>%
select(-c("GenomeType","Subgenome","DelCountGroup")) %>%
filter(!is.na(Longitude)) %>% ### 802个个体 -> 543 个有经纬度信息
group_by(Stand_LoadGroup) %>%
arrange(Stand_delCount) %>% #### 排序
group_map(~fun_plotMapLoad(.x,.y))
## [1] "~/Documents/map_GERP>1.5.pdf"
## [1] "~/Documents/map_GERP>2.14.pdf"
## [1] "~/Documents/map_GERP>3.1.pdf"
## [1] "~/Documents/map_Nonsynonymous.pdf"
## [1] "~/Documents/map_PPH2.pdf"
## [1] "~/Documents/map_PPH2_Probably.pdf"
## [1] "~/Documents/map_VEP_HighImpact.pdf"
q <- ggpubr::ggarrange(plotlist=list(df[[4]],df[[5]],df[[6]],
df[[1]],df[[2]],df[[3]],
df[[7]]),
# labels = as.character(c(1:26)),
font.label = list(size = 7),
ncol = 2,nrow = 4)
cowplot::save_plot(q,filename = "~/Documents/test.pdf",base_height = 7.2,base_width = 7.2)
boxplot load hexaploid 多种情况循环画图
- 可以选择定义有害突变的方式 unique(dfdel$Stand_LoadGroup)
fun_plotMutationLoadBoxplot <- function(df,factor_fun){
dfV1 <- df %>%
group_by(Subgenome, GroupforExpansionLoad) %>%
summarise(mean = mean(Stand_delCount)) %>%
arrange(Subgenome, mean) %>%
ungroup() %>%
filter(Subgenome == "D") %>%
select(GroupforExpansionLoad)
### 因子排序,排序后计数
df$GroupforExpansionLoad <-
factor(df$GroupforExpansionLoad, levels = dfV1$GroupforExpansionLoad)
### 计算每个亚群的个数
### 千万注意数据框的分组情况,如sub VariantsGroup ,计算后核查确认!!!
dftaxaCount <- df %>%
group_by(Subgenome, GroupforExpansionLoad) %>%
count() %>% ungroup() %>%
distinct(GroupforExpansionLoad, .keep_all = T) %>%
select(-Subgenome) %>%
mutate(GroupforExpansionLoad_AddCount = str_c(GroupforExpansionLoad, " (", n, ")", sep = "")) %>%
select(-n)
library(RColorBrewer)
p <- df %>%
ggplot(aes(x = GroupforExpansionLoad, y = Stand_delCount)) +
labs(y = str_c("Individual-based mutation load"),
x = "",
title = factor_fun) +
geom_boxplot(aes(fill = GroupforExpansionLoad),
outlier.color = NA,
alpha = 0.6) + ### width=0.1
geom_point(
position = position_jitterdodge(0.6),
alpha = 0.4,
aes(color = Subgenome),
size = 0.2
) +
stat_summary(
aes(group = Subgenome),
fun = mean,
geom = "point",
color = "blue",
position = position_dodge(1),
size = 0.7
) +
scale_color_manual(values = c("#fd8582", "#967bce", "#4bcdc6"),
name = '') +
scale_fill_manual(values = colorRampPalette(rev(brewer.pal(
n = 1, name = "RdBu"
)))(16)) +
scale_x_discrete(labels = dftaxaCount$GroupforExpansionLoad_AddCount) +
theme(strip.background = element_blank()) +
# theme(plot.margin=unit(rep(1.3,4),'lines'))+
coord_flip() +
theme(
plot.title = element_text(hjust = 0.5),
panel.background = element_blank(),
panel.border = element_rect(
colour = "black",
fill = NA,
size = 0.4
),
text = element_text(size = 7),
legend.position = 'none'
)
p
filename_fun <- str_c("~/Documents/",factor_fun,".pdf")
ggsave(p,
# filename = "/Users/Aoyue/Documents/test.pdf",
filename = filename_fun,
height = 4.2,
width = 2.8)
print(filename_fun)
return(p)
}
### Begin
factorA <- c("Ratio_002_nonsynonymous","Ratio_033_Derived_PolyPhen2_0.5","Ratio_026_Derived_PolyPhen2_probably","Ratio_032_GERP16way_1.5","Ratio_014_GERP16way_2.14_max","Ratio_025_GERP16way_3.1","Ratio_007_VEP")
factorA_label <- c("Nonsynonymous","PPH2","PPH2_Probably","GERP>1.5","GERP>2.14","GERP>3.1","VEP_HighImpact")
### ********************** Section1 *********************************###
dftaxaDB <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
select(Taxa,GenomeType,GroupforExpansionLoad)
## Rows: 1062 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Taxa, GenomeType, GroupforExpansionLoad, GroupforExpansionLoad2, Su...
## dbl (2): Latitude, Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### ********************** Section2 *********************************###
### del load 数据框
dfdel <- read_tsv("data/004_delCount/delRatio.txt.gz")
## Rows: 371456 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): Taxa, Subgenome, IfSelected, Group_refobj, DelCountGroup, Stand_Loa...
## dbl (1): Stand_delCount
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- dfdel %>%
left_join(dftaxaDB, by = "Taxa") %>%
filter(GenomeType == "AABBDD") %>%
filter(Subgenome == "D") %>%
filter(DelCountGroup == "TotalDerivedSNPCount") %>%
select(Taxa,Subgenome,Stand_LoadGroup,Stand_delCount,GroupforExpansionLoad) %>%
filter(!is.na(GroupforExpansionLoad)) %>%
filter(Stand_LoadGroup %in% factorA) %>%
mutate(Stand_LoadGroup=factor(Stand_LoadGroup,levels = factorA,labels = factorA_label)) %>%
mutate(Stand_LoadGroup=as.character(Stand_LoadGroup)) %>%
group_by(Stand_LoadGroup) %>%
group_map(~fun_plotMutationLoadBoxplot(.x,.y))
## `summarise()` has grouped output by 'Subgenome'. You can override using the `.groups` argument.
## Warning in brewer.pal(n = 1, name = "RdBu"): minimal value for n is 3, returning requested palette with 3 different levels
## [1] "~/Documents/GERP>1.5.pdf"
## `summarise()` has grouped output by 'Subgenome'. You can override using the `.groups` argument.
## Warning in brewer.pal(n = 1, name = "RdBu"): minimal value for n is 3, returning requested palette with 3 different levels
## [1] "~/Documents/GERP>2.14.pdf"
## `summarise()` has grouped output by 'Subgenome'. You can override using the `.groups` argument.
## Warning in brewer.pal(n = 1, name = "RdBu"): minimal value for n is 3, returning requested palette with 3 different levels
## [1] "~/Documents/GERP>3.1.pdf"
## `summarise()` has grouped output by 'Subgenome'. You can override using the `.groups` argument.
## Warning in brewer.pal(n = 1, name = "RdBu"): minimal value for n is 3, returning requested palette with 3 different levels
## [1] "~/Documents/Nonsynonymous.pdf"
## `summarise()` has grouped output by 'Subgenome'. You can override using the `.groups` argument.
## Warning in brewer.pal(n = 1, name = "RdBu"): minimal value for n is 3, returning requested palette with 3 different levels
## [1] "~/Documents/PPH2.pdf"
## `summarise()` has grouped output by 'Subgenome'. You can override using the `.groups` argument.
## Warning in brewer.pal(n = 1, name = "RdBu"): minimal value for n is 3, returning requested palette with 3 different levels
## [1] "~/Documents/PPH2_Probably.pdf"
## `summarise()` has grouped output by 'Subgenome'. You can override using the `.groups` argument.
## Warning in brewer.pal(n = 1, name = "RdBu"): minimal value for n is 3, returning requested palette with 3 different levels
## [1] "~/Documents/VEP_HighImpact.pdf"
q <- ggpubr::ggarrange(plotlist=list(df[[4]],df[[5]],df[[6]],
df[[1]],df[[2]],df[[3]],
df[[7]]),
# labels = as.character(c(1:26)),
font.label = list(size = 7),
ncol = 3,nrow = 3)
cowplot::save_plot(q,filename = "~/Documents/test.pdf",base_height = 7.2,base_width = 7.2)
boxplot load tetraploid 多种情况循环画图
- 可以选择定义有害突变的方式 unique(dfdel$Stand_LoadGroup)
fun_plotMutationLoadBoxplot <- function(df,factor_fun){
dfV1 <- df %>%
group_by(Subgenome, GroupforExpansionLoad) %>%
summarise(mean = mean(Stand_delCount)) %>%
arrange(Subgenome, mean) %>%
ungroup() %>%
filter(Subgenome == "A") %>%
select(GroupforExpansionLoad)
### 因子排序,排序后计数
df$GroupforExpansionLoad <-
factor(df$GroupforExpansionLoad, levels = dfV1$GroupforExpansionLoad)
### 计算每个亚群的个数
### 千万注意数据框的分组情况,如sub VariantsGroup ,计算后核查确认!!!
dftaxaCount <- df %>%
group_by(Subgenome, GroupforExpansionLoad) %>%
count() %>% ungroup() %>%
distinct(GroupforExpansionLoad, .keep_all = T) %>%
select(-Subgenome) %>%
mutate(GroupforExpansionLoad_AddCount = str_c(GroupforExpansionLoad, " (", n, ")", sep = "")) %>%
select(-n)
library(RColorBrewer)
p <- df %>%
ggplot(aes(x = GroupforExpansionLoad, y = Stand_delCount)) +
labs(y = str_c("Individual-based mutation load"),
x = "",
title = factor_fun) +
geom_boxplot(aes(fill = GroupforExpansionLoad),
outlier.color = NA,
alpha = 0.6) + ### width=0.1
geom_point(
position = position_jitterdodge(0.6),
alpha = 0.4,
aes(color = Subgenome),
size = 0.2
) +
stat_summary(
aes(group = Subgenome),
fun = mean,
geom = "point",
color = "blue",
position = position_dodge(1),
size = 0.7
) +
scale_color_manual(values = c("#fd8582", "#967bce", "#4bcdc6"),
name = '') +
scale_fill_manual(values = colorRampPalette(rev(brewer.pal(
n = 3, name = "RdBu"
)))(9)) +
scale_x_discrete(labels = dftaxaCount$GroupforExpansionLoad_AddCount) +
theme(strip.background = element_blank()) +
# theme(plot.margin=unit(rep(1.3,4),'lines'))+
coord_flip() +
theme(
plot.title = element_text(hjust = 0.5),
panel.background = element_blank(),
panel.border = element_rect(
colour = "black",
fill = NA,
size = 0.4
),
text = element_text(size = 7),
legend.position = 'none'
)
p
filename_fun <- str_c("~/Documents/",factor_fun,".pdf")
ggsave(p,
# filename = "/Users/Aoyue/Documents/test.pdf",
filename = filename_fun,
height = 4.2,
width = 2.8)
print(filename_fun)
return(p)
}
### Begin
factorA <- c("Ratio_002_nonsynonymous","Ratio_033_Derived_PolyPhen2_0.5","Ratio_026_Derived_PolyPhen2_probably","Ratio_032_GERP16way_1.5","Ratio_014_GERP16way_2.14_max","Ratio_025_GERP16way_3.1","Ratio_007_VEP")
factorA_label <- c("Nonsynonymous","PPH2","PPH2_Probably","GERP>1.5","GERP>2.14","GERP>3.1","VEP_HighImpact")
### ********************** Section1 *********************************###
dftaxaDB <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
select(Taxa,GenomeType,GroupforExpansionLoad)
## Rows: 1062 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Taxa, GenomeType, GroupforExpansionLoad, GroupforExpansionLoad2, Su...
## dbl (2): Latitude, Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### ********************** Section2 *********************************###
### del load 数据框
dfdel <- read_tsv("data/004_delCount/delRatio.txt.gz")
## Rows: 371456 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): Taxa, Subgenome, IfSelected, Group_refobj, DelCountGroup, Stand_Loa...
## dbl (1): Stand_delCount
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- dfdel %>%
left_join(dftaxaDB, by = "Taxa") %>%
filter(GenomeType == "AABB") %>%
filter(Subgenome == "A") %>%
filter(DelCountGroup == "TotalDerivedSNPCount") %>%
select(Taxa,Subgenome,Stand_LoadGroup,Stand_delCount,GroupforExpansionLoad) %>%
filter(!is.na(GroupforExpansionLoad)) %>%
filter(Stand_LoadGroup %in% factorA) %>%
mutate(Stand_LoadGroup=factor(Stand_LoadGroup,levels = factorA,labels = factorA_label)) %>%
mutate(Stand_LoadGroup=as.character(Stand_LoadGroup)) %>%
group_by(Stand_LoadGroup) %>%
group_map(~fun_plotMutationLoadBoxplot(.x,.y))
## `summarise()` has grouped output by 'Subgenome'. You can override using the `.groups` argument.
## [1] "~/Documents/GERP>1.5.pdf"
## `summarise()` has grouped output by 'Subgenome'. You can override using the `.groups` argument.
## [1] "~/Documents/GERP>2.14.pdf"
## `summarise()` has grouped output by 'Subgenome'. You can override using the `.groups` argument.
## [1] "~/Documents/GERP>3.1.pdf"
## `summarise()` has grouped output by 'Subgenome'. You can override using the `.groups` argument.
## [1] "~/Documents/Nonsynonymous.pdf"
## `summarise()` has grouped output by 'Subgenome'. You can override using the `.groups` argument.
## [1] "~/Documents/PPH2.pdf"
## `summarise()` has grouped output by 'Subgenome'. You can override using the `.groups` argument.
## [1] "~/Documents/PPH2_Probably.pdf"
## `summarise()` has grouped output by 'Subgenome'. You can override using the `.groups` argument.
## [1] "~/Documents/VEP_HighImpact.pdf"
q <- ggpubr::ggarrange(plotlist=list(df[[4]],df[[5]],df[[6]],
df[[1]],df[[2]],df[[3]],
df[[7]]),
# labels = as.character(c(1:26)),
font.label = list(size = 7),
ncol = 3,nrow = 3)
cowplot::save_plot(q,filename = "~/Documents/test.pdf",base_height = 7.2,base_width = 7.2)
====== Proportion in intron 3_UTR 5_UTR
dftotal
dftotal <- read_tsv("data/005_SNPAnnoDB/geneSNPAnno.txt.gz") %>%
mutate(Sub=str_sub(Transcript, 9, 9))
## Rows: 2672838 Columns: 41
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (21): ID, Ref, Alt, Major, Minor, Transcript, Ancestral, Region, Variant...
## dbl (20): Chr, Pos, Maf, DAF, Gerp, Alt_SIFT, Ref_SIFT, Derived_SIFT, Gerp_1...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# filter(!is.na(Ancestral),Ancestral==Ref|Ancestral==Alt) %>%
# mutate(IfRefisAnc = if_else(Ref == Ancestral,"Anc","Der")) %>%
# mutate(Sub=str_sub(Transcript, 9, 9)) %>%
dftotal_ancestral <- dftotal %>%
filter(!is.na(Ancestral),Ancestral == Ref | Ancestral == Alt)
### 每个位点的注释信息
df1 <- dftotal %>%
group_by(Region) %>%
filter(!is.na(Region)) %>%
filter(Region %in% c("Intron","UTR_3","UTR_5")) %>%
select(ID,Chr,Pos,Sub,Variants_Group=Region)
df2 <- dftotal %>%
anti_join(df1,by="ID") %>%
filter(!is.na(Variant_type)) %>%
filter( Variant_type %in% c("NONSYNONYMOUS","SYNONYMOUS","STOP-GAIN")) %>%
select(ID,Chr,Pos,Sub,Variants_Group=Variant_type)
df3 <- dftotal %>%
anti_join(df1,by="ID") %>%
anti_join(df2,by="ID") %>%
filter(Impact_VEP=="HIGH") %>%
filter(str_detect(Effect_VEP,"splice_")) %>%
mutate(Variants_Group="Essential splice") %>%
select(ID,Chr,Sub,Pos,Variants_Group)
factorA <- c("Intron","UTR_5","UTR_3","SYNONYMOUS","NONSYNONYMOUS","Essential splice","STOP-GAIN")
labelsA <- c("Intron","UTR_5","UTR_3","Synonymous","Nonsynonymous","Essential splice","Nonsense")
df_anno <- rbind(df1,df2,df3) %>%
mutate(Variants_Group=factor(Variants_Group,levels=factorA,labels = labelsA)) %>%
select(ID,Sub,Variants_Group)
### 合计 2670892 个位点得到注释
### ---------------------------------------------------------------------------
###每个亚群的AAF
### 注意这里的AAF中,有 ancestral 状态的为DAF,没有ancestral 状态的 AAF
### 因为使用的是 ancestralVCF 计算的AAF
dftotal_AAF <- read_tsv("data/015_AAF/001_aaf_bySubspecies/source/aaf_bySubspecies_filterAAF0.txt.gz")
## Rows: 19112004 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Group
## dbl (3): Chr, Pos, AAF
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dftotal_AAF_filter1 <- dftotal_AAF %>%
filter(AAF!=1)
### 合计 5343891 AAF 在0-1之间
### 联合上文求出的 ID,Chr,Pos,Variants_Group 对每个点进行注释,
### 联合 dftaxaDB,对每个亚群进行倍性的注释
dftaxaDB <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
distinct(GroupforExpansionLoad,.keep_all = T) %>%
select(GenomeType,GroupforExpansionLoad,GroupforExpansionLoad2)
## Rows: 1062 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Taxa, GenomeType, GroupforExpansionLoad, GroupforExpansionLoad2, Su...
## dbl (2): Latitude, Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dfSNPdb <- dftotal_AAF_filter1 %>%
mutate(ID=str_c(Chr,"-",Pos)) %>%
left_join(df_anno,by="ID") %>%
filter(!is.na(Variants_Group))
dfSNPdb_count <- dfSNPdb %>%
group_by(Group,Variants_Group) %>%
count() %>%
left_join(dftaxaDB,by=c("Group"="GroupforExpansionLoad2"))
plot 全基因组各种变异类型的比例
factorA <- c("Wild emmer","Domesticated emmer","Durum","Khorasan wheat","Polish wheat","Rivet wheat","Georgian wheat","Ispahanicum","Persian wheat","LR_WA","LR_AF","LR_EU","LR_AM","LR_EA","LR_CSA","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Vavilovii","Xinjiang wheat","Yunan wheat","Macha","Spelt","Strangulata")
df <- dfSNPdb_count %>%
mutate(GroupforExpansionLoad=factor(GroupforExpansionLoad,levels = factorA)) %>%
mutate(GenomeType2=if_else(GenomeType=="AABBDD",GenomeType,"AABB_DD")) %>%
group_by(GenomeType2) %>%
group_map(~{
### 画图函数
p <- .x %>%
ggplot(aes(x=GroupforExpansionLoad, y=n,fill=Variants_Group)) +
geom_bar(
stat = "identity",
# alpha = 1,
position = "fill",
alpha = 0.9
) +
# geom_vline(xintercept = 4.5,linetype = "solid",color = "black") +
# geom_text(aes(label= paste(prettyNum(ObservedCount,big.mark = ","))),position = position_fill(),hjust=1, size=3) +
# scale_fill_manual(values=colB)+
scale_fill_brewer(palette = "Accent") +
labs(y = "Frequency of categories by subspecies", x = "") +
coord_flip() +
theme(
panel.background = element_rect(
size = 0.7,
colour = 'black',
fill = 'white'
),
panel.grid = element_blank(),
text = element_text(size = 12)
# ,legend.position = 'none'
)
p
ggsave(p,
filename = str_c("~/Documents/",.y,"_proportion_VariantsGroup.pdf"),
height = 4,
width = 6)
return(p)
})
q <- ggpubr::ggarrange(plotlist=list(df[[2]],df[[1]]),
# labels = as.character(c(1:26)),
font.label = list(size = 7),
ncol = 1,nrow = 2)
cowplot::save_plot(q,filename = "~/Documents/test.pdf",base_height = 7.2,base_width = 7.2)
plot 不同变异类型,不同亚群的比较-分面图
factorA <- c("Wild emmer","Domesticated emmer","Durum","Khorasan wheat","Polish wheat","Rivet wheat","Georgian wheat","Ispahanicum","Persian wheat","LR_WA","LR_AF","LR_EU","LR_AM","LR_EA","LR_CSA","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Vavilovii","Xinjiang wheat","Yunan wheat","Macha","Spelt","Strangulata")
df <- dfSNPdb_count %>%
mutate(GroupforExpansionLoad=factor(GroupforExpansionLoad,levels = factorA)) %>%
mutate(GenomeType2=if_else(GenomeType=="AABBDD",GenomeType,"AABB_DD")) %>%
group_by(GenomeType2) %>%
group_map(~{
group1 <- c("Intron","UTR_5","UTR_3","Syn","Nonsyn","Essential splice","Nonsense")
p <- .x %>%
mutate(Group = factor(Variants_Group, levels = group1)) %>%
ggplot(aes(x = GroupforExpansionLoad, y = n, fill = Variants_Group)) +
geom_bar(stat = "identity") +
scale_fill_brewer(palette = "Accent") +
facet_wrap( ~ Variants_Group, scales = "free",ncol = 2) +
labs(x = "Subspecies", y = "Number of sites observed\nto be variable") +
scale_y_continuous(labels = scales::comma_format(big.mark = ',')) +
theme_bw() +
theme(legend.position = "none") +
theme(axis.text.x = element_text(angle = 45,vjust = 1,hjust = 1,size = 7))
p
ggsave(p,
filename = str_c("~/Documents/",.y,"_count_VariantsGroup.pdf"),
height = 7,
width = 7)
return(p)
})
====== Proportion in del non syn
###每个亚群的AAF
### 注意这里的AAF中,指得是DAF,全是有 ancestral 状态的位点
### 因为使用的是 ancestralVCF 计算的AAF
dftotal_withAncestral_AAF <- read_tsv("data/015_AAF/001_aaf_bySubspecies/001_aaf_bySubspecies.txt.gz")
## Rows: 3018785 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): Sub, LoadGroup, Group, GenomeType
## dbl (3): Chr, Pos, AAF
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dftotal_withAncestral_AAF_filter1 <- dftotal_withAncestral_AAF %>%
filter(AAF!=1)
dfSNPdb_count <- dftotal_withAncestral_AAF_filter1 %>%
group_by(Group,LoadGroup,GenomeType) %>%
count() %>%
rename(GroupforExpansionLoad=Group,Variants_Group=LoadGroup)
plot derived 各种变异类型的比例
factorA <- c("Wild emmer","Domesticated emmer","Durum","Khorasan wheat","Polish wheat","Rivet wheat","Georgian wheat","Ispahanicum","Persian wheat","LR_WA","LR_AF","LR_EU","LR_AM","LR_EA","LR_CSA","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Vavilovii","Xinjiang wheat","Yunan wheat","Macha","Spelt","Strangulata")
df <- dfSNPdb_count %>%
mutate(GroupforExpansionLoad=factor(GroupforExpansionLoad,levels = factorA)) %>%
mutate(GenomeType2=if_else(GenomeType=="AABBDD",GenomeType,"AABB_DD")) %>%
group_by(GenomeType2) %>%
group_map(~{
### 画图函数
p <- .x %>%
mutate(Variants_Group=factor(Variants_Group,levels = c("Deleterious","Nonsynonymous","Synonymous"))) %>%
ggplot(aes(x=GroupforExpansionLoad, y=n,fill=Variants_Group)) +
geom_bar(
stat = "identity",
# alpha = 1,
position = "fill",
alpha = 0.9
) +
# geom_vline(xintercept = 4.5,linetype = "solid",color = "black") +
# geom_text(aes(label= paste(prettyNum(ObservedCount,big.mark = ","))),position = position_fill(),hjust=1, size=3) +
# scale_fill_manual(values=colB)+
scale_fill_manual(values = c("#d5311c","#e69f00","#004680"))+
labs(y = "Frequency of categories by subspecies", x = "") +
coord_flip() +
theme(
panel.background = element_rect(
size = 0.7,
colour = 'black',
fill = 'white'
),
panel.grid = element_blank(),
text = element_text(size = 12)
# ,legend.position = 'none'
)
p
ggsave(p,
filename = str_c("~/Documents/",.y,"_proportion_VariantsGroup.pdf"),
height = 4,
width = 6)
return(p)
})
q <- ggpubr::ggarrange(plotlist=list(df[[2]],df[[1]]),
# labels = as.character(c(1:26)),
font.label = list(size = 7),
ncol = 1,nrow = 2)
cowplot::save_plot(q,filename = "~/Documents/test.pdf",base_height = 7.2,base_width = 7.2)
plot 不同LoadGroup变异类型,不同亚群的比较-分面图
factorA <- c("Wild emmer","Domesticated emmer","Durum","Khorasan wheat","Polish wheat","Rivet wheat","Georgian wheat","Ispahanicum","Persian wheat","LR_WA","LR_AF","LR_EU","LR_AM","LR_EA","LR_CSA","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Vavilovii","Xinjiang wheat","Yunan wheat","Macha","Spelt","Strangulata")
df <- dfSNPdb_count %>%
mutate(GroupforExpansionLoad=factor(GroupforExpansionLoad,levels = factorA)) %>%
mutate(GenomeType2=if_else(GenomeType=="AABBDD",GenomeType,"AABB_DD")) %>%
group_by(GenomeType2) %>%
group_map(~{
group1 <- c("Intron","UTR_5","UTR_3","Syn","Nonsyn","Essential splice","Nonsense")
p <- .x %>%
mutate(Variants_Group=factor(Variants_Group,levels = c("Deleterious","Nonsynonymous","Synonymous"))) %>%
mutate(Group = factor(Variants_Group, levels = group1)) %>%
ggplot(aes(x = GroupforExpansionLoad, y = n, fill = Variants_Group)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("#d5311c","#e69f00","#004680"))+
facet_wrap( ~ Variants_Group, scales = "free",ncol = 2) +
labs(x = "Subspecies", y = "Number of sites observed\nto be variable") +
scale_y_continuous(labels = scales::comma_format(big.mark = ',')) +
theme_bw() +
theme(legend.position = "none") +
theme(axis.text.x = element_text(angle = 45,vjust = 1,hjust = 1,size = 8))
p
ggsave(p,
filename = str_c("~/Documents/",.y,"_count_VariantsGroup.pdf"),
height = 7,
width = 7)
return(p)
})